From c9147c5ecb121e7f2be951100ba91fe83ccb1a65 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 6 Feb 2006 11:30:56 +0100 Subject: [PATCH 1/5] bug#13965 ndb - error while restarting in dict improve error message when changed config leads to failed restart ndb/src/kernel/blocks/dbdict/Dbdict.cpp: improve error message --- ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index 6564963f61a..6f1e9fee3be 100644 --- a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -2397,7 +2397,19 @@ Dbdict::restartCreateTab_readTableConf(Signal* signal, Uint32 sz = c_readTableRecord.noOfPages * ZSIZE_OF_PAGES_IN_WORDS; SimplePropertiesLinearReader r(&pageRecPtr.p->word[0], sz); handleTabInfoInit(r, &parseRecord); - ndbrequire(parseRecord.errorCode == 0); + if (parseRecord.errorCode != 0) + { + char buf[255]; + BaseString::snprintf(buf, sizeof(buf), + "Unable to restart, fail while creating table %d" + " error: %d. Most likely change of configution", + c_readTableRecord.tableId, + parseRecord.errorCode); + progError(__LINE__, + ERR_INVALID_CONFIG, + buf); + ndbrequire(parseRecord.errorCode == 0); + } /* ---------------------------------------------------------------- */ // We have read the table description from disk as part of system restart. 
From db4d82bad49a5c1b69c4b3134ad9c2f25b9e3221 Mon Sep 17 00:00:00 2001 From: unknown Date: Mon, 6 Feb 2006 11:42:44 +0100 Subject: [PATCH 2/5] bug#13966 - ndb better error message on invalid config change ndb/src/kernel/blocks/dbdict/Dbdict.cpp: fix typo ndb/src/kernel/blocks/dblqh/DblqhMain.cpp: Change error message...note that this is a guess --- ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 2 +- ndb/src/kernel/blocks/dblqh/DblqhMain.cpp | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index 6f1e9fee3be..2bb429aeabc 100644 --- a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -2402,7 +2402,7 @@ Dbdict::restartCreateTab_readTableConf(Signal* signal, char buf[255]; BaseString::snprintf(buf, sizeof(buf), "Unable to restart, fail while creating table %d" - " error: %d. Most likely change of configution", + " error: %d. Most likely change of configuration", c_readTableRecord.tableId, parseRecord.errorCode); progError(__LINE__, diff --git a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index c8460630d62..80a8805697e 100644 --- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -16096,8 +16096,22 @@ void Dblqh::findLogfile(Signal* signal, }//if locLogFilePtr.i = locLogFilePtr.p->nextLogFile; loopCount++; + if (loopCount >= flfLogPartPtr.p->noLogFiles && + getNodeState().startLevel != NodeState::SL_STARTED) + { + goto error; + } ndbrequire(loopCount < flfLogPartPtr.p->noLogFiles); }//while + +error: + char buf[255]; + BaseString::snprintf(buf, sizeof(buf), + "Unable to restart, failed while reading redo." 
+ " Likely invalid change of configuration"); + progError(__LINE__, + ERR_INVALID_CONFIG, + buf); }//Dblqh::findLogfile() /* ------------------------------------------------------------------------- */ From bbcb4a567a298edf6c7a8846f4a57377702f5d9c Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 10 Feb 2006 09:17:53 +0100 Subject: [PATCH 3/5] bug#17295 - ndb - error while reading REDO log fix corruption caused by page 0 of file 0 being released ndb/src/kernel/blocks/dblqh/DblqhMain.cpp: Make sure that page 0 of file 0 isn't released --- ndb/src/kernel/blocks/dblqh/DblqhMain.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index 80a8805697e..fb6aa026b3b 100644 --- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -14767,7 +14767,9 @@ void Dblqh::execSr(Signal* signal) signal->theData[4] = logFilePtr.p->currentFilepage; signal->theData[5] = logFilePtr.p->currentMbyte; signal->theData[6] = logPagePtr.p->logPageWord[ZCURR_PAGE_INDEX]; - sendSignal(cownref, GSN_DEBUG_SIG, signal, 7, JBA); + signal->theData[7] = ~0; + signal->theData[8] = __LINE__; + sendSignal(cownref, GSN_DEBUG_SIG, signal, 9, JBA); return; }//if }//if @@ -14833,7 +14835,8 @@ void Dblqh::execSr(Signal* signal) signal->theData[5] = logFilePtr.p->currentFilepage; signal->theData[6] = logPagePtr.p->logPageWord[ZCURR_PAGE_INDEX]; signal->theData[7] = logWord; - sendSignal(cownref, GSN_DEBUG_SIG, signal, 8, JBA); + signal->theData[8] = __LINE__; + sendSignal(cownref, GSN_DEBUG_SIG, signal, 9, JBA); return; break; }//switch @@ -14862,8 +14865,9 @@ void Dblqh::execDEBUG_SIG(Signal* signal) char buf[100]; BaseString::snprintf(buf, 100, - "Error while reading REDO log.\n" + "Error while reading REDO log. 
from %d\n" "D=%d, F=%d Mb=%d FP=%d W1=%d W2=%d", + signal->theData[8], signal->theData[2], signal->theData[3], signal->theData[4], signal->theData[5], signal->theData[6], signal->theData[7]); @@ -15439,6 +15443,10 @@ void Dblqh::readSrFourthZeroLab(Signal* signal) // to read a page from file. lfoPtr.p->lfoState = LogFileOperationRecord::WRITE_SR_INVALIDATE_PAGES; + /** + * Make sure we dont release zero page + */ + seizeLogpage(signal); invalidateLogAfterLastGCI(signal); return; }//Dblqh::readSrFourthZeroLab() From 165d5390698c66217d0614afa097cb61bd859af1 Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 10 Feb 2006 09:37:36 +0100 Subject: [PATCH 4/5] bug#10987 - ndb - unable to find restorable replica Introduce new variable c_newest_restorable_gci which is set _after_ both GCP_SAVE and COPY_GCI. This variable is used when cutting redo (calcKeepGci). Also make sure a complete GCI is run in between LCPs. ndb/src/kernel/blocks/dbdih/Dbdih.hpp: Introduce new variable c_newest_restorable_gci which is set _after_ both GCP_SAVE and COPY_GCI. This variable is used when cutting redo (calcKeepGci). Also make sure a complete GCI is run in between LCPs. ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: Introduce new variable c_newest_restorable_gci which is set _after_ both GCP_SAVE and COPY_GCI. This variable is used when cutting redo (calcKeepGci). Also make sure a complete GCI is run in between LCPs. --- ndb/src/kernel/blocks/dbdih/Dbdih.hpp | 7 +++--- ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 28 ++++++++++++++--------- 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index ee67bf47d7b..0c107e35603 100644 --- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -774,7 +774,7 @@ private: //------------------------------------ // Methods for LCP functionality //------------------------------------ - void checkKeepGci(Uint32 replicaStartIndex); + void 
checkKeepGci(TabRecordPtr, Uint32, Fragmentstore*, Uint32); void checkLcpStart(Signal *, Uint32 lineNo); void checkStartMoreLcp(Signal *, Uint32 nodeId); bool reportLcpCompletion(const class LcpFragRep *); @@ -1292,7 +1292,7 @@ private: } Uint32 lcpStart; - Uint32 lcpStartGcp; + Uint32 lcpStopGcp; Uint32 keepGci; /* USED TO CALCULATE THE GCI TO KEEP AFTER A LCP */ Uint32 oldestRestorableGci; @@ -1361,7 +1361,8 @@ private: Uint32 cstarttype; Uint32 csystemnodes; Uint32 currentgcp; - + Uint32 c_newest_restorable_gci; + enum GcpMasterTakeOverState { GMTOS_IDLE = 0, GMTOS_INITIAL = 1, diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 97cd8c374c6..1d2124e1b32 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -674,6 +674,7 @@ void Dbdih::execCOPY_GCIREQ(Signal* signal) jam(); coldgcp = SYSFILE->newestRestorableGCI; crestartGci = SYSFILE->newestRestorableGCI; + c_newest_restorable_gci = SYSFILE->newestRestorableGCI; Sysfile::setRestartOngoing(SYSFILE->systemRestartBits); currentgcp = coldgcp + 1; cnewgcp = coldgcp + 1; @@ -692,6 +693,7 @@ void Dbdih::execCOPY_GCIREQ(Signal* signal) ok = true; jam(); cgcpParticipantState = GCP_PARTICIPANT_COPY_GCI_RECEIVED; + c_newest_restorable_gci = SYSFILE->newestRestorableGCI; setNodeInfo(signal); break; }//if @@ -7749,6 +7751,8 @@ void Dbdih::execCOPY_GCICONF(Signal* signal) signal->theData[1] = coldgcp; sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB); + c_newest_restorable_gci = coldgcp; + CRASH_INSERTION(7004); emptyWaitGCPMasterQueue(signal); cgcpStatus = GCP_READY; @@ -9155,7 +9159,7 @@ void Dbdih::checkTcCounterLab(Signal* signal) }//if c_lcpState.ctimer += 32; if ((c_nodeStartMaster.blockLcp == true) || - ((c_lcpState.lcpStartGcp + 1) > currentgcp)) { + (c_lcpState.lcpStopGcp >= c_newest_restorable_gci)) { jam(); /* --------------------------------------------------------------------- */ // No reason to start 
juggling the states and checking for start of LCP if @@ -9238,7 +9242,6 @@ void Dbdih::execTCGETOPSIZECONF(Signal* signal) /* ----------------------------------------------------------------------- */ c_lcpState.ctimer = 0; c_lcpState.keepGci = coldgcp; - c_lcpState.lcpStartGcp = currentgcp; /* ----------------------------------------------------------------------- */ /* UPDATE THE NEW LATEST LOCAL CHECKPOINT ID. */ /* ----------------------------------------------------------------------- */ @@ -9310,7 +9313,7 @@ void Dbdih::calculateKeepGciLab(Signal* signal, Uint32 tableId, Uint32 fragId) cnoOfActiveTables++; FragmentstorePtr fragPtr; getFragstore(tabPtr.p, fragId, fragPtr); - checkKeepGci(fragPtr.p->storedReplicas); + checkKeepGci(tabPtr, fragId, fragPtr.p, fragPtr.p->storedReplicas); fragId++; if (fragId >= tabPtr.p->totalfragments) { jam(); @@ -10168,6 +10171,7 @@ void Dbdih::allNodesLcpCompletedLab(Signal* signal) signal->theData[0] = EventReport::LocalCheckpointCompleted; //Event type signal->theData[1] = SYSFILE->latestLCP_ID; sendSignal(CMVMI_REF, GSN_EVENT_REP, signal, 2, JBB); + c_lcpState.lcpStopGcp = c_newest_restorable_gci; /** * Start checking for next LCP @@ -10522,7 +10526,8 @@ void Dbdih::checkEscalation() /* DESCRIPTION: CHECK FOR MINIMUM GCI RESTORABLE WITH NEW LOCAL */ /* CHECKPOINT. 
*/ /*************************************************************************/ -void Dbdih::checkKeepGci(Uint32 replicaStartIndex) +void Dbdih::checkKeepGci(TabRecordPtr tabPtr, Uint32 fragId, Fragmentstore*, + Uint32 replicaStartIndex) { ReplicaRecordPtr ckgReplicaPtr; ckgReplicaPtr.i = replicaStartIndex; @@ -10544,7 +10549,6 @@ void Dbdih::checkKeepGci(Uint32 replicaStartIndex) if (oldestRestorableGci > c_lcpState.oldestRestorableGci) { jam(); c_lcpState.oldestRestorableGci = oldestRestorableGci; - ndbrequire(((int)c_lcpState.oldestRestorableGci) >= 0); }//if ckgReplicaPtr.i = ckgReplicaPtr.p->nextReplica; }//while @@ -10838,7 +10842,7 @@ void Dbdih::findMinGci(ReplicaRecordPtr fmgReplicaPtr, do { ndbrequire(lcpNo < MAX_LCP_STORED); if (fmgReplicaPtr.p->lcpStatus[lcpNo] == ZVALID && - fmgReplicaPtr.p->maxGciStarted[lcpNo] <= coldgcp) + fmgReplicaPtr.p->maxGciStarted[lcpNo] < c_newest_restorable_gci) { jam(); keepGci = fmgReplicaPtr.p->maxGciCompleted[lcpNo]; @@ -10960,7 +10964,7 @@ void Dbdih::initCommonData() c_lcpState.clcpDelay = 0; c_lcpState.lcpStart = ZIDLE; - c_lcpState.lcpStartGcp = 0; + c_lcpState.lcpStopGcp = 0; c_lcpState.setLcpStatus(LCP_STATUS_IDLE, __LINE__); c_lcpState.currentFragment.tableId = 0; c_lcpState.currentFragment.fragmentId = 0; @@ -10996,6 +11000,7 @@ void Dbdih::initCommonData() csystemnodes = 0; c_updateToLock = RNIL; currentgcp = 0; + c_newest_restorable_gci = 0; cverifyQueueCounter = 0; cwaitLcpSr = false; @@ -11067,6 +11072,7 @@ void Dbdih::initRestartInfo() currentgcp = 2; cnewgcp = 2; crestartGci = 1; + c_newest_restorable_gci = 1; SYSFILE->keepGCI = 1; SYSFILE->oldestRestorableGCI = 1; @@ -13038,9 +13044,9 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) if (signal->theData[0] == 7001) { infoEvent("c_lcpState.keepGci = %d", c_lcpState.keepGci); - infoEvent("c_lcpState.lcpStatus = %d, clcpStartGcp = %d", + infoEvent("c_lcpState.lcpStatus = %d, clcpStopGcp = %d", c_lcpState.lcpStatus, - c_lcpState.lcpStartGcp); + 
c_lcpState.lcpStopGcp); infoEvent("cgcpStartCounter = %d, cimmediateLcpStart = %d", cgcpStartCounter, c_lcpState.immediateLcpStart); }//if @@ -13221,8 +13227,8 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) infoEvent("lcpStatus = %d (update place = %d) ", c_lcpState.lcpStatus, c_lcpState.lcpStatusUpdatedPlace); infoEvent - ("lcpStart = %d lcpStartGcp = %d keepGci = %d oldestRestorable = %d", - c_lcpState.lcpStart, c_lcpState.lcpStartGcp, + ("lcpStart = %d lcpStopGcp = %d keepGci = %d oldestRestorable = %d", + c_lcpState.lcpStart, c_lcpState.lcpStopGcp, c_lcpState.keepGci, c_lcpState.oldestRestorableGci); infoEvent From 72a0f506356f74281ede079288566ae15bb8c1cf Mon Sep 17 00:00:00 2001 From: unknown Date: Fri, 10 Feb 2006 10:10:52 +0100 Subject: [PATCH 5/5] merge --- ndb/src/kernel/blocks/dbdict/Dbdict.cpp | 2 +- ndb/src/kernel/blocks/dblqh/DblqhMain.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp index 0052ec0588f..ca9daca428b 100644 --- a/ndb/src/kernel/blocks/dbdict/Dbdict.cpp +++ b/ndb/src/kernel/blocks/dbdict/Dbdict.cpp @@ -2584,7 +2584,7 @@ Dbdict::restartCreateTab_readTableConf(Signal* signal, c_readTableRecord.tableId, parseRecord.errorCode); progError(__LINE__, - ERR_INVALID_CONFIG, + NDBD_EXIT_INVALID_CONFIG, buf); ndbrequire(parseRecord.errorCode == 0); } diff --git a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index fdb99719643..b75263b747d 100644 --- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -16176,7 +16176,7 @@ error: "Unable to restart, failed while reading redo." " Likely invalid change of configuration"); progError(__LINE__, - ERR_INVALID_CONFIG, + NDBD_EXIT_INVALID_CONFIG, buf); }//Dblqh::findLogfile()