Merge pull request #10628 from VOREStation/Arokha/ssrobust

Improve subsystem robustness
This commit is contained in:
Novacat
2021-06-12 18:05:27 -04:00
committed by Chompstation Bot
parent e876bc2107
commit e497461cd3
4 changed files with 47 additions and 35 deletions

View File

@@ -57,23 +57,23 @@ var/datum/controller/failsafe/Failsafe
if(4,5)
--defcon
if(3)
to_chat(GLOB.admins, "<span class='adminnotice'>Notice: DEFCON [defcon_pretty()]. The Master Controller has not fired in the last [(5-defcon) * processing_interval] ticks.</span>")
log_and_message_admins("<span class='adminnotice'>SSfailsafe Notice: DEFCON [defcon_pretty()]. The Master Controller (\ref[Master]) has not fired in the last [(5-defcon) * processing_interval] ticks.</span>")
--defcon
if(2)
to_chat(GLOB.admins, "<span class='boldannounce'>Warning: DEFCON [defcon_pretty()]. The Master Controller has not fired in the last [(5-defcon) * processing_interval] ticks. Automatic restart in [processing_interval] ticks.</span>")
log_and_message_admins("<span class='boldannounce'>SSfailsafe Warning: DEFCON [defcon_pretty()]. The Master Controller (\ref[Master]) has not fired in the last [(5-defcon) * processing_interval] ticks. Automatic restart in [processing_interval] ticks.</span>")
--defcon
if(1)
to_chat(GLOB.admins, "<span class='boldannounce'>Warning: DEFCON [defcon_pretty()]. The Master Controller has still not fired within the last [(5-defcon) * processing_interval] ticks. Killing and restarting...</span>")
log_and_message_admins("<span class='boldannounce'>SSfailsafe Warning: DEFCON [defcon_pretty()]. The Master Controller (\ref[Master]) has still not fired within the last [(5-defcon) * processing_interval] ticks. Killing and restarting...</span>")
--defcon
var/rtn = Recreate_MC()
if(rtn > 0)
defcon = 4
master_iteration = 0
to_chat(GLOB.admins, "<span class='adminnotice'>MC restarted successfully</span>")
log_and_message_admins("<span class='adminnotice'>SSfailsafe Notice: MC (New:\ref[Master]) restarted successfully</span>")
else if(rtn < 0)
log_world("FailSafe: Could not restart MC, runtime encountered. Entering defcon 0")
to_chat(GLOB.admins, "<span class='boldannounce'>ERROR: DEFCON [defcon_pretty()]. Could not restart MC, runtime encountered. I will silently keep retrying.</span>")
log_game("SSfailsafe Notice: Could not restart MC (\ref[Master]), runtime encountered. Entering defcon 0")
log_and_message_admins("<span class='boldannounce'>SSFAILSAFE ERROR: DEFCON [defcon_pretty()]. Could not restart MC (\ref[Master]), runtime encountered. I will silently keep retrying.</span>")
//if the return number was 0, it just means the mc was restarted too recently, and it just needs some time before we try again
//no need to handle that specially when defcon 0 can handle it
if(0) //DEFCON 0! (mc failed to restart)
@@ -81,7 +81,7 @@ var/datum/controller/failsafe/Failsafe
if(rtn > 0)
defcon = 4
master_iteration = 0
to_chat(GLOB.admins, "<span class='adminnotice'>MC restarted successfully</span>")
log_and_message_admins("<span class='adminnotice'>SSfailsafe Notice: MC (New:\ref[Master]) restarted successfully</span>")
else
defcon = min(defcon + 1,5)
master_iteration = Master.iteration

View File

@@ -132,11 +132,11 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
LAZYINITLIST(BadBoy.failure_strikes)
switch(++BadBoy.failure_strikes[BadBoy.type])
if(2)
msg = "The [BadBoy.name] subsystem was the last to fire for 2 controller restarts. It will be recovered now and disabled if it happens again."
msg = "MC Notice: The [BadBoy.name] subsystem was the last to fire for 2 controller restarts. It will be recovered now and disabled if it happens again."
FireHim = TRUE
BadBoy.fail()
if(3)
msg = "The [BadBoy.name] subsystem seems to be destabilizing the MC and will be offlined."
msg = "MC Notice: The [BadBoy.name] subsystem seems to be destabilizing the MC and will be offlined."
BadBoy.flags |= SS_NO_FIRE
BadBoy.critfail()
if(msg)
@@ -152,7 +152,7 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
current_runlevel = Master.current_runlevel
StartProcessing(10)
else
to_chat(world, "<span class='boldannounce'>The Master Controller is having some issues, we will need to re-initialize EVERYTHING</span>")
to_world("<span class='boldannounce'>The Master Controller is having some issues, we will need to re-initialize EVERYTHING</span>")
Initialize(20, TRUE)
@@ -170,7 +170,7 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
if(init_sss)
init_subtypes(/datum/controller/subsystem, subsystems)
to_chat(world, "<span class='boldannounce'>Initializing subsystems...</span>")
to_chat(world, "<span class='boldannounce'>MC: Initializing subsystems...</span>")
// Sort subsystems by init_order, so they initialize in the correct order.
sortTim(subsystems, /proc/cmp_subsystem_init)
@@ -186,7 +186,7 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
current_ticklimit = TICK_LIMIT_RUNNING
var/time = (REALTIMEOFDAY - start_timeofday) / 10
var/msg = "Initializations complete within [time] second[time == 1 ? "" : "s"]!"
var/msg = "MC: Initializations complete within [time] second[time == 1 ? "" : "s"]!"
to_chat(world, "<span class='boldannounce'>[msg]</span>")
log_world(msg)
@@ -229,15 +229,17 @@ GLOBAL_REAL(Master, /datum/controller/master) = new
if (rtn > 0 || processing < 0)
return //this was suppose to happen.
//loop ended, restart the mc
log_game("MC crashed or runtimed, restarting")
message_admins("MC crashed or runtimed, restarting")
log_world("MC crashed or runtimed, restarting")
log_and_message_admins("MC Notice: MC crashed or runtimed, self-restarting (\ref[src])")
var/rtn2 = Recreate_MC()
if (rtn2 <= 0)
log_game("Failed to recreate MC (Error code: [rtn2]), it's up to the failsafe now")
message_admins("Failed to recreate MC (Error code: [rtn2]), it's up to the failsafe now")
log_world("Failed to recreate MC (Error code: [rtn2]), it's up to the failsafe now")
Failsafe.defcon = 2
switch(rtn2)
if(-1)
log_and_message_admins("MC Warning: Failed to self-recreate MC (Return code: [rtn2]), it's up to the failsafe now (\ref[src])")
Failsafe.defcon = 2
if(0)
log_and_message_admins("MC Warning: Too soon for MC self-restart (Return code: [rtn2]), going to let failsafe handle it (\ref[src])")
Failsafe.defcon = 2
if(1)
log_and_message_admins("MC Notice: MC self-recreated, old MC departing (Return code: [rtn2]) (\ref[src])")
// Main loop.
/datum/controller/master/proc/Loop()

View File

@@ -68,10 +68,14 @@ Total Unsimulated Turfs: [world.maxx*world.maxy*world.maxz - simulated_turf_coun
/datum/controller/subsystem/air/fire(resumed = 0)
var/timer
if(!resumed)
ASSERT(LAZYLEN(currentrun) == 0) // Santity checks to make sure we don't somehow have items left over from last cycle
ASSERT(current_step == null) // Or somehow didn't finish all the steps from last cycle
current_cycle++ // Begin a new air_master cycle!
current_step = SSAIR_TURFS // Start with Step 1 of course
// Sanity checks to make sure we don't somehow have items left over from last cycle
// Or somehow didn't finish all the steps from last cycle
if(LAZYLEN(currentrun) || current_step)
log_and_message_admins("SSair: Was told to start a new run, but the previous run wasn't finished! currentrun.len=[currentrun.len], current_step=[current_step]")
resumed = TRUE
else
current_cycle++ // Begin a new air_master cycle!
current_step = SSAIR_TURFS // Start with Step 1 of course
INTERNAL_PROCESS_STEP(SSAIR_TURFS, TRUE, process_tiles_to_update, cost_turfs, SSAIR_EDGES)
INTERNAL_PROCESS_STEP(SSAIR_EDGES, FALSE, process_active_edges, cost_edges, SSAIR_FIREZONES)
@@ -80,10 +84,11 @@ Total Unsimulated Turfs: [world.maxx*world.maxy*world.maxz - simulated_turf_coun
INTERNAL_PROCESS_STEP(SSAIR_ZONES, FALSE, process_zones_to_update, cost_zones, SSAIR_DONE)
// Okay, we're done! Woo! Got thru a whole air_master cycle!
ASSERT(LAZYLEN(currentrun) == 0) // Sanity checks to make sure there are really none left
ASSERT(current_step == SSAIR_DONE) // And that we didn't somehow skip past the last step
currentrun = null
current_step = null
if(LAZYLEN(currentrun) || current_step != SSAIR_DONE)
log_and_message_admins("SSair: Was not able to complete a full air cycle despite reaching the end of fire(). This shouldn't happen.")
else
currentrun = null
current_step = null
/datum/controller/subsystem/air/proc/process_tiles_to_update(resumed = 0)
if (!resumed)

View File

@@ -45,9 +45,13 @@ SUBSYSTEM_DEF(lighting)
/datum/controller/subsystem/lighting/fire(resumed = FALSE)
var/timer
if(!resumed)
ASSERT(LAZYLEN(currentrun) == 0) // Santity checks to make sure we don't somehow have items left over from last cycle
ASSERT(stage == null) // Or somehow didn't finish all the steps from last cycle
stage = SSLIGHTING_STAGE_LIGHTS // Start with Step 1 of course
// Sanity checks to make sure we don't somehow have items left over from last cycle
// Or somehow didn't finish all the steps from last cycle
if(LAZYLEN(currentrun) || stage)
log_and_message_admins("SSlighting: Was told to start a new run, but the previous run wasn't finished! currentrun.len=[currentrun.len], stage=[stage]")
resumed = TRUE
else
stage = SSLIGHTING_STAGE_LIGHTS // Start with Step 1 of course
if(stage == SSLIGHTING_STAGE_LIGHTS)
timer = TICK_USAGE
@@ -77,10 +81,11 @@ SUBSYSTEM_DEF(lighting)
stage = SSLIGHTING_STAGE_DONE
// Okay, we're done! Woo! Got thru a whole lighting cycle!
ASSERT(LAZYLEN(currentrun) == 0) // Sanity checks to make sure there are really none left
ASSERT(stage == SSLIGHTING_STAGE_DONE) // And that we didn't somehow skip past the last step
currentrun = null
stage = null
if(LAZYLEN(currentrun) || stage != SSLIGHTING_STAGE_DONE)
log_and_message_admins("SSlighting: Was not able to complete a full lighting cycle despite reaching the end of fire(). This shouldn't happen.")
else
currentrun = null
stage = null
/datum/controller/subsystem/lighting/proc/internal_process_lights(resumed = FALSE, init_tick_checks = FALSE)
if (!resumed)