Merge pull request #10628 from VOREStation/Arokha/ssrobust

Improve subsystem robustness
This commit is contained in:
Novacat
2021-06-12 18:05:27 -04:00
committed by Chompstation Bot
parent e876bc2107
commit e497461cd3
4 changed files with 47 additions and 35 deletions

View File

@@ -57,23 +57,23 @@ var/datum/controller/failsafe/Failsafe
if(4,5) if(4,5)
--defcon --defcon
if(3) if(3)
to_chat(GLOB.admins, "<span class='adminnotice'>Notice: DEFCON [defcon_pretty()]. The Master Controller has not fired in the last [(5-defcon) * processing_interval] ticks.</span>") log_and_message_admins("<span class='adminnotice'>SSfailsafe Notice: DEFCON [defcon_pretty()]. The Master Controller (\ref[Master]) has not fired in the last [(5-defcon) * processing_interval] ticks.</span>")
--defcon --defcon
if(2) if(2)
to_chat(GLOB.admins, "<span class='boldannounce'>Warning: DEFCON [defcon_pretty()]. The Master Controller has not fired in the last [(5-defcon) * processing_interval] ticks. Automatic restart in [processing_interval] ticks.</span>") log_and_message_admins("<span class='boldannounce'>SSfailsafe Warning: DEFCON [defcon_pretty()]. The Master Controller (\ref[Master]) has not fired in the last [(5-defcon) * processing_interval] ticks. Automatic restart in [processing_interval] ticks.</span>")
--defcon --defcon
if(1) if(1)
to_chat(GLOB.admins, "<span class='boldannounce'>Warning: DEFCON [defcon_pretty()]. The Master Controller has still not fired within the last [(5-defcon) * processing_interval] ticks. Killing and restarting...</span>") log_and_message_admins("<span class='boldannounce'>SSfailsafe Warning: DEFCON [defcon_pretty()]. The Master Controller (\ref[Master]) has still not fired within the last [(5-defcon) * processing_interval] ticks. Killing and restarting...</span>")
--defcon --defcon
var/rtn = Recreate_MC() var/rtn = Recreate_MC()
if(rtn > 0) if(rtn > 0)
defcon = 4 defcon = 4
master_iteration = 0 master_iteration = 0
to_chat(GLOB.admins, "<span class='adminnotice'>MC restarted successfully</span>") log_and_message_admins("<span class='adminnotice'>SSfailsafe Notice: MC (New:\ref[Master]) restarted successfully</span>")
else if(rtn < 0) else if(rtn < 0)
log_world("FailSafe: Could not restart MC, runtime encountered. Entering defcon 0") log_game("SSfailsafe Notice: Could not restart MC (\ref[Master]), runtime encountered. Entering defcon 0")
to_chat(GLOB.admins, "<span class='boldannounce'>ERROR: DEFCON [defcon_pretty()]. Could not restart MC, runtime encountered. I will silently keep retrying.</span>") log_and_message_admins("<span class='boldannounce'>SSFAILSAFE ERROR: DEFCON [defcon_pretty()]. Could not restart MC (\ref[Master]), runtime encountered. I will silently keep retrying.</span>")
//if the return number was 0, it just means the mc was restarted too recently, and it just needs some time before we try again //if the return number was 0, it just means the mc was restarted too recently, and it just needs some time before we try again
//no need to handle that specially when defcon 0 can handle it //no need to handle that specially when defcon 0 can handle it
if(0) //DEFCON 0! (mc failed to restart) if(0) //DEFCON 0! (mc failed to restart)
@@ -81,7 +81,7 @@ var/datum/controller/failsafe/Failsafe
if(rtn > 0) if(rtn > 0)
defcon = 4 defcon = 4
master_iteration = 0 master_iteration = 0
to_chat(GLOB.admins, "<span class='adminnotice'>MC restarted successfully</span>") log_and_message_admins("<span class='adminnotice'>SSfailsafe Notice: MC (New:\ref[Master]) restarted successfully</span>")
else else
defcon = min(defcon + 1,5) defcon = min(defcon + 1,5)
master_iteration = Master.iteration master_iteration = Master.iteration

View File

@@ -132,11 +132,11 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
LAZYINITLIST(BadBoy.failure_strikes) LAZYINITLIST(BadBoy.failure_strikes)
switch(++BadBoy.failure_strikes[BadBoy.type]) switch(++BadBoy.failure_strikes[BadBoy.type])
if(2) if(2)
msg = "The [BadBoy.name] subsystem was the last to fire for 2 controller restarts. It will be recovered now and disabled if it happens again." msg = "MC Notice: The [BadBoy.name] subsystem was the last to fire for 2 controller restarts. It will be recovered now and disabled if it happens again."
FireHim = TRUE FireHim = TRUE
BadBoy.fail() BadBoy.fail()
if(3) if(3)
msg = "The [BadBoy.name] subsystem seems to be destabilizing the MC and will be offlined." msg = "MC Notice: The [BadBoy.name] subsystem seems to be destabilizing the MC and will be offlined."
BadBoy.flags |= SS_NO_FIRE BadBoy.flags |= SS_NO_FIRE
BadBoy.critfail() BadBoy.critfail()
if(msg) if(msg)
@@ -152,7 +152,7 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
current_runlevel = Master.current_runlevel current_runlevel = Master.current_runlevel
StartProcessing(10) StartProcessing(10)
else else
to_chat(world, "<span class='boldannounce'>The Master Controller is having some issues, we will need to re-initialize EVERYTHING</span>") to_world("<span class='boldannounce'>The Master Controller is having some issues, we will need to re-initialize EVERYTHING</span>")
Initialize(20, TRUE) Initialize(20, TRUE)
@@ -170,7 +170,7 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
if(init_sss) if(init_sss)
init_subtypes(/datum/controller/subsystem, subsystems) init_subtypes(/datum/controller/subsystem, subsystems)
to_chat(world, "<span class='boldannounce'>Initializing subsystems...</span>") to_chat(world, "<span class='boldannounce'>MC: Initializing subsystems...</span>")
// Sort subsystems by init_order, so they initialize in the correct order. // Sort subsystems by init_order, so they initialize in the correct order.
sortTim(subsystems, /proc/cmp_subsystem_init) sortTim(subsystems, /proc/cmp_subsystem_init)
@@ -186,7 +186,7 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
current_ticklimit = TICK_LIMIT_RUNNING current_ticklimit = TICK_LIMIT_RUNNING
var/time = (REALTIMEOFDAY - start_timeofday) / 10 var/time = (REALTIMEOFDAY - start_timeofday) / 10
var/msg = "Initializations complete within [time] second[time == 1 ? "" : "s"]!" var/msg = "MC: Initializations complete within [time] second[time == 1 ? "" : "s"]!"
to_chat(world, "<span class='boldannounce'>[msg]</span>") to_chat(world, "<span class='boldannounce'>[msg]</span>")
log_world(msg) log_world(msg)
@@ -229,15 +229,17 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
if (rtn > 0 || processing < 0) if (rtn > 0 || processing < 0)
return //this was supposed to happen. return //this was supposed to happen.
//loop ended, restart the mc //loop ended, restart the mc
log_game("MC crashed or runtimed, restarting") log_and_message_admins("MC Notice: MC crashed or runtimed, self-restarting (\ref[src])")
message_admins("MC crashed or runtimed, restarting")
log_world("MC crashed or runtimed, restarting")
var/rtn2 = Recreate_MC() var/rtn2 = Recreate_MC()
if (rtn2 <= 0) switch(rtn2)
log_game("Failed to recreate MC (Error code: [rtn2]), it's up to the failsafe now") if(-1)
message_admins("Failed to recreate MC (Error code: [rtn2]), it's up to the failsafe now") log_and_message_admins("MC Warning: Failed to self-recreate MC (Return code: [rtn2]), it's up to the failsafe now (\ref[src])")
log_world("Failed to recreate MC (Error code: [rtn2]), it's up to the failsafe now")
Failsafe.defcon = 2 Failsafe.defcon = 2
if(0)
log_and_message_admins("MC Warning: Too soon for MC self-restart (Return code: [rtn2]), going to let failsafe handle it (\ref[src])")
Failsafe.defcon = 2
if(1)
log_and_message_admins("MC Notice: MC self-recreated, old MC departing (Return code: [rtn2]) (\ref[src])")
// Main loop. // Main loop.
/datum/controller/master/proc/Loop() /datum/controller/master/proc/Loop()

View File

@@ -68,8 +68,12 @@ Total Unsimulated Turfs: [world.maxx*world.maxy*world.maxz - simulated_turf_coun
/datum/controller/subsystem/air/fire(resumed = 0) /datum/controller/subsystem/air/fire(resumed = 0)
var/timer var/timer
if(!resumed) if(!resumed)
ASSERT(LAZYLEN(currentrun) == 0) // Sanity checks to make sure we don't somehow have items left over from last cycle // Sanity checks to make sure we don't somehow have items left over from last cycle
ASSERT(current_step == null) // Or somehow didn't finish all the steps from last cycle // Or somehow didn't finish all the steps from last cycle
if(LAZYLEN(currentrun) || current_step)
log_and_message_admins("SSair: Was told to start a new run, but the previous run wasn't finished! currentrun.len=[currentrun.len], current_step=[current_step]")
resumed = TRUE
else
current_cycle++ // Begin a new air_master cycle! current_cycle++ // Begin a new air_master cycle!
current_step = SSAIR_TURFS // Start with Step 1 of course current_step = SSAIR_TURFS // Start with Step 1 of course
@@ -80,8 +84,9 @@ Total Unsimulated Turfs: [world.maxx*world.maxy*world.maxz - simulated_turf_coun
INTERNAL_PROCESS_STEP(SSAIR_ZONES, FALSE, process_zones_to_update, cost_zones, SSAIR_DONE) INTERNAL_PROCESS_STEP(SSAIR_ZONES, FALSE, process_zones_to_update, cost_zones, SSAIR_DONE)
// Okay, we're done! Woo! Got thru a whole air_master cycle! // Okay, we're done! Woo! Got thru a whole air_master cycle!
ASSERT(LAZYLEN(currentrun) == 0) // Sanity checks to make sure there are really none left if(LAZYLEN(currentrun) || current_step != SSAIR_DONE)
ASSERT(current_step == SSAIR_DONE) // And that we didn't somehow skip past the last step log_and_message_admins("SSair: Was not able to complete a full air cycle despite reaching the end of fire(). This shouldn't happen.")
else
currentrun = null currentrun = null
current_step = null current_step = null

View File

@@ -45,8 +45,12 @@ SUBSYSTEM_DEF(lighting)
/datum/controller/subsystem/lighting/fire(resumed = FALSE) /datum/controller/subsystem/lighting/fire(resumed = FALSE)
var/timer var/timer
if(!resumed) if(!resumed)
ASSERT(LAZYLEN(currentrun) == 0) // Sanity checks to make sure we don't somehow have items left over from last cycle // Sanity checks to make sure we don't somehow have items left over from last cycle
ASSERT(stage == null) // Or somehow didn't finish all the steps from last cycle // Or somehow didn't finish all the steps from last cycle
if(LAZYLEN(currentrun) || stage)
log_and_message_admins("SSlighting: Was told to start a new run, but the previous run wasn't finished! currentrun.len=[currentrun.len], stage=[stage]")
resumed = TRUE
else
stage = SSLIGHTING_STAGE_LIGHTS // Start with Step 1 of course stage = SSLIGHTING_STAGE_LIGHTS // Start with Step 1 of course
if(stage == SSLIGHTING_STAGE_LIGHTS) if(stage == SSLIGHTING_STAGE_LIGHTS)
@@ -77,8 +81,9 @@ SUBSYSTEM_DEF(lighting)
stage = SSLIGHTING_STAGE_DONE stage = SSLIGHTING_STAGE_DONE
// Okay, we're done! Woo! Got thru a whole lighting cycle! // Okay, we're done! Woo! Got thru a whole lighting cycle!
ASSERT(LAZYLEN(currentrun) == 0) // Sanity checks to make sure there are really none left if(LAZYLEN(currentrun) || stage != SSLIGHTING_STAGE_DONE)
ASSERT(stage == SSLIGHTING_STAGE_DONE) // And that we didn't somehow skip past the last step log_and_message_admins("SSlighting: Was not able to complete a full lighting cycle despite reaching the end of fire(). This shouldn't happen.")
else
currentrun = null currentrun = null
stage = null stage = null