diff --git a/storage/ndb/src/kernel/blocks/ERROR_codes.txt b/storage/ndb/src/kernel/blocks/ERROR_codes.txt index 67eb89f850f..58479dfe7b6 100644 --- a/storage/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/storage/ndb/src/kernel/blocks/ERROR_codes.txt @@ -5,7 +5,7 @@ Next DBACC 3002 Next DBTUP 4029 Next DBLQH 5045 Next DBDICT 6007 -Next DBDIH 7183 +Next DBDIH 7186 Next DBTC 8040 Next CMVMI 9000 Next BACKUP 10038 @@ -75,6 +75,10 @@ Delay GCP_SAVEREQ by 10 secs 7180: Crash master during master-take-over in execMASTER_LCPCONF +7184: Crash before starting next GCP after a node failure + +7185: Don't reply to COPY_GCI_REQ where reason == GCP + ERROR CODES FOR TESTING NODE FAILURE, LOCAL CHECKPOINT HANDLING: ----------------------------------------------------------------- diff --git a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index 4c060ff4e2b..6321679269d 100644 --- a/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/storage/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -899,7 +899,7 @@ private: void ndbsttorry10Lab(Signal *, Uint32 _line); void createMutexes(Signal* signal, Uint32 no); void createMutex_done(Signal* signal, Uint32 no, Uint32 retVal); - void crashSystemAtGcpStop(Signal *); + void crashSystemAtGcpStop(Signal *, bool); void sendFirstDictfragsreq(Signal *, TabRecordPtr regTabPtr); void addtabrefuseLab(Signal *, ConnectRecordPtr regConnectPtr, Uint32 errorCode); void GCP_SAVEhandling(Signal *, Uint32 nodeId); diff --git a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index a2f8351e308..e731bde5917 100644 --- a/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/storage/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -747,6 +747,13 @@ done: } ndbrequire(ok); + + if (ERROR_INSERTED(7185) && reason==CopyGCIReq::GLOBAL_CHECKPOINT) + { + jam(); + return; + } + /* ----------------------------------------------------------------------- */ /* WE START BY TRYING TO 
OPEN THE FIRST RESTORABLE GCI FILE. */ /* ----------------------------------------------------------------------- */ @@ -4071,6 +4078,11 @@ void Dbdih::execNODE_FAILREP(Signal* signal) CLEAR_ERROR_INSERT_VALUE; } + if (ERROR_INSERTED(7184)) + { + SET_ERROR_INSERT_VALUE(7000); + } + /*-------------------------------------------------------------------------*/ // The first step is to convert from a bit mask to an array of failed nodes. /*-------------------------------------------------------------------------*/ @@ -7745,7 +7757,7 @@ void Dbdih::checkGcpStopLab(Signal* signal) g_eventLogger.error("System crash due to GCP Stop in state = %u", (Uint32) cgcpStatus); #endif - crashSystemAtGcpStop(signal); + crashSystemAtGcpStop(signal, false); return; }//if } else { @@ -7759,7 +7771,7 @@ void Dbdih::checkGcpStopLab(Signal* signal) g_eventLogger.error("System crash due to GCP Stop in state = %u", (Uint32) cgcpStatus); #endif - crashSystemAtGcpStop(signal); + crashSystemAtGcpStop(signal, false); return; }//if } else { @@ -11117,41 +11129,132 @@ void Dbdih::tableCloseLab(Signal* signal, FileRecordPtr filePtr) * GCP stop detected, * send SYSTEM_ERROR to all other alive nodes */ -void Dbdih::crashSystemAtGcpStop(Signal* signal) +void Dbdih::crashSystemAtGcpStop(Signal* signal, bool local) { + if (local) + goto dolocal; + switch(cgcpStatus){ + case GCP_PREPARE_SENT: + { + jam(); + /** + * We're waiting for a GCP PREPARE CONF + */ + infoEvent("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_PREPARE_Counter.getText()); + ndbout_c("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_PREPARE_Counter.getText()); + + { + NodeReceiverGroup rg(DBDIH, c_GCP_PREPARE_Counter); + signal->theData[0] = 7022; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA); + } + + { + NodeReceiverGroup rg(NDBCNTR, c_GCP_PREPARE_Counter); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef 
= reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + sendSignal(rg, GSN_SYSTEM_ERROR, signal, + SystemError::SignalLength, JBA); + } + ndbrequire(!c_GCP_PREPARE_Counter.done()); + return; + } + case GCP_COMMIT_SENT: + { + jam(); + /** + * We're waiting for a GCP_NODEFINISH + */ + infoEvent("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_COMMIT_Counter.getText()); + ndbout_c("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_COMMIT_Counter.getText()); + + { + NodeReceiverGroup rg(DBDIH, c_GCP_COMMIT_Counter); + signal->theData[0] = 7022; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA); + } + + { + NodeReceiverGroup rg(NDBCNTR, c_GCP_COMMIT_Counter); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + sendSignal(rg, GSN_SYSTEM_ERROR, signal, + SystemError::SignalLength, JBA); + } + ndbrequire(!c_GCP_COMMIT_Counter.done()); + return; + } case GCP_NODE_FINISHED: { + jam(); /** * We're waiting for a GCP save conf */ - ndbrequire(!c_GCP_SAVEREQ_Counter.done()); NodeReceiverGroup rg(DBLQH, c_GCP_SAVEREQ_Counter); signal->theData[0] = 2305; sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB); - infoEvent("Detected GCP stop...sending kill to %s", - c_GCP_SAVEREQ_Counter.getText()); - g_eventLogger.error("Detected GCP stop...sending kill to %s", - c_GCP_SAVEREQ_Counter.getText()); + infoEvent("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_SAVEREQ_Counter.getText()); + ndbout_c("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_GCP_SAVEREQ_Counter.getText()); + ndbrequire(!c_GCP_SAVEREQ_Counter.done()); return; } case GCP_SAVE_LQH_FINISHED: - g_eventLogger.error("m_copyReason: %d m_waiting: %d", - c_copyGCIMaster.m_copyReason, - c_copyGCIMaster.m_waiting); - break; - case GCP_READY: // shut up lint - case 
GCP_PREPARE_SENT: - case GCP_COMMIT_SENT: - break; + { + jam(); + /** + * We're waiting for a COPY_GCICONF + */ + infoEvent("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_COPY_GCIREQ_Counter.getText()); + ndbout_c("Detected GCP stop(%d)...sending kill to %s", + cgcpStatus, c_COPY_GCIREQ_Counter.getText()); + + { + NodeReceiverGroup rg(DBDIH, c_COPY_GCIREQ_Counter); + signal->theData[0] = 7022; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBA); + } + + { + NodeReceiverGroup rg(NDBCNTR, c_COPY_GCIREQ_Counter); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + sendSignal(rg, GSN_SYSTEM_ERROR, signal, + SystemError::SignalLength, JBA); + } + ndbrequire(!c_COPY_GCIREQ_Counter.done()); + return; } + case GCP_READY: (void)1; + } + +dolocal: + ndbout_c("m_copyReason: %d m_waiting: %d", + c_copyGCIMaster.m_copyReason, + c_copyGCIMaster.m_waiting); - g_eventLogger.error("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d", - c_copyGCISlave.m_senderData, - c_copyGCISlave.m_senderRef, - c_copyGCISlave.m_copyReason, - c_copyGCISlave.m_expectedNextWord); + ndbout_c("c_copyGCISlave: sender{Data, Ref} %d %x reason: %d nextWord: %d", + c_copyGCISlave.m_senderData, + c_copyGCISlave.m_senderRef, + c_copyGCISlave.m_copyReason, + c_copyGCISlave.m_expectedNextWord); FileRecordPtr file0Ptr; file0Ptr.i = crestartInfoFile[0]; @@ -11202,23 +11305,39 @@ void Dbdih::crashSystemAtGcpStop(Signal* signal) c_TCGETOPSIZEREQ_Counter.getText()); ndbout_c("c_UPDATE_TOREQ_Counter = %s", c_UPDATE_TOREQ_Counter.getText()); - NodeRecordPtr nodePtr; - for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { + if (local == false) + { jam(); - ptrAss(nodePtr, nodeRecord); - if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) { + NodeRecordPtr nodePtr; + for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; 
nodePtr.i++) { jam(); - const BlockReference ref = - numberToRef(refToBlock(cntrlblockref), nodePtr.i); - SystemError * const sysErr = (SystemError*)&signal->theData[0]; - sysErr->errorCode = SystemError::GCPStopDetected; - sysErr->errorRef = reference(); - sysErr->data1 = cgcpStatus; - sysErr->data2 = cgcpOrderBlocked; - sendSignal(ref, GSN_SYSTEM_ERROR, signal, - SystemError::SignalLength, JBA); - }//if - }//for + ptrAss(nodePtr, nodeRecord); + if (nodePtr.p->nodeStatus == NodeRecord::ALIVE) { + jam(); + const BlockReference ref = + numberToRef(refToBlock(cntrlblockref), nodePtr.i); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + sendSignal(ref, GSN_SYSTEM_ERROR, signal, + SystemError::SignalLength, JBA); + }//if + }//for + } + else + { + jam(); + SystemError * const sysErr = (SystemError*)&signal->theData[0]; + sysErr->errorCode = SystemError::GCPStopDetected; + sysErr->errorRef = reference(); + sysErr->data1 = cgcpStatus; + sysErr->data2 = cgcpOrderBlocked; + EXECUTE_DIRECT(NDBCNTR, GSN_SYSTEM_ERROR, + signal, SystemError::SignalLength); + ndbrequire(false); + } return; }//Dbdih::crashSystemAtGcpStop() @@ -14304,6 +14423,12 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) infoEvent(buf); } } + + if (arg == 7022) + { + jam(); + crashSystemAtGcpStop(signal, true); + } }//Dbdih::execDUMP_STATE_ORD() void