Replace old watchdog script with the new system that we used on Failstation, plus some fixes.

This commit is contained in:
Rob Nelson
2013-08-13 00:09:17 -07:00
parent 51c137cbd9
commit 61f2e7fa02

View File

@@ -1,51 +1,157 @@
import subprocess
import socket
import urlparse
import os
import struct
import time
import urllib
import json
import logging
import logging.handlers
# --- UDP listener address (the socket created at the bottom binds to it) ---
UDP_IP = "127.0.0.1"
UDP_PORT = 8019

# --- TCP status monitoring ---
# (IP, port) pair that open_socket() connects to.
MONITOR = ('127.0.0.1', 1336)
# Socket timeout in seconds used by open_socket().
TIMEOUT = 30.0
# Failure count the main loop compares against before giving up.
MAX_FAILURES = 3

# --- Commands and paths ---
# Command line handed to the shell when a restart is attempted.
RESTART_COMMAND = "/home/gmod/byond/ss13.sh"
# File that receives the parsed status response as JSON.
STATS_FILE = '/home/gmod/stats.json'
# Directory that holds crash.log.
LOGPATH = '/home/gmod/byond/crashlogs/'

# IPv4 datagram (UDP) socket, bound as soon as this module is loaded.
sock = socket.socket(family=socket.AF_INET, type=socket.SOCK_DGRAM)
sock.bind((UDP_IP, UDP_PORT))
def open_socket():
    """Open a TCP connection to the MONITOR address.

    The timeout is installed *before* connecting so that the connection
    attempt itself is bounded by TIMEOUT, not only the later send/recv
    calls.  If connecting fails the half-open socket is closed before the
    exception is re-raised, so no file descriptor is leaked.

    Returns:
        The connected socket object (never True/False).

    Raises:
        socket.timeout: the connection attempt exceeded TIMEOUT seconds.
        socket.error: any other connection failure.
    """
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.settimeout(TIMEOUT)
        s.connect(MONITOR)
    except Exception:
        # Release the descriptor, then let the caller see the original error.
        s.close()
        raise
    return s
# Snippet below adapted from http://pastebin.com/TGhPBPGp
def decode_packet(packet):
    """Decode one response packet received from the game server.

    Packet layout, as consumed by this function:
      bytes 0-1  magic header 0x00 0x83
      bytes 2-3  big-endian unsigned short: size of type byte plus payload
      byte  4    type identifier: 0x2a = 4-byte float, 0x06 = ASCII string
      bytes 5+   payload

    Returns:
        The float for 0x2a packets, the string with NUL bytes removed for
        0x06 packets, and b'' (after logging an error) for everything
        else: empty input, wrong header, unknown type, truncated float.
    """
    # Slices are used instead of single-index lookups so the comparisons
    # behave the same whether the buffer is a Python 2 str or a bytes object.
    # BOTH header bytes must match for the packet to be accepted.
    if len(packet) >= 5 and packet[0:2] == b'\x00\x83':
        # Size of the type identifier plus content.  ROB: Big-endian!
        declared = struct.unpack('>H', packet[2:4])[0]
        # Payload size is the declared size minus the identifier byte.
        size = declared - 1
        kind = packet[4:5]
        if kind == b'\x2a':
            if len(packet) >= 9:
                # struct.unpack returns a 1-tuple; the value is element 0.
                # NOTE(review): 'f' decodes in native byte order - confirm
                # the byte order the server actually uses for floats.
                return struct.unpack('f', packet[5:9])[0]
        elif kind == b'\x06':
            # A payload shorter than declared is returned as far as it goes.
            return packet[5:5 + max(size, 0)].replace(b'\x00', b'')
    # logging.getLogger() with no name is the root logger.
    logging.getLogger().error('UNKNOWN PACKET: {0}'.format(repr(packet)))
    return b''
def _build_query(request):
    """Wrap a topic string such as b'?status' in the server's query framing."""
    # All queries must begin with a question mark (ie "?players").
    # startswith() also copes with an empty request.
    if not request.startswith(b'?'):
        request = b'?' + request
    # Packet structure is reverse-engineered, see http://pastebin.com/TGhPBPGp
    return b''.join((
        b'\x00\x83',
        # Rob: BIG-endian.  6 = the five padding bytes plus the trailing NUL.
        struct.pack('>H', len(request) + 6),
        b'\x00\x00\x00\x00\x00',
        request,
        b'\x00',
    ))


def _recv_packet(s):
    """Read one complete response packet from the connected socket *s*.

    Reading continues until the 4 header bytes plus the size declared in
    bytes 2-3 have arrived, or the peer closes the connection.  A short
    recv() alone does not mean the packet is complete, and a packet whose
    length is an exact multiple of the buffer size must not trigger one
    more blocking read.
    """
    data = b''
    while True:
        buf = s.recv(1024)
        if not buf:
            break  # connection closed by the peer
        data += buf
        if len(data) >= 4:
            expected = 4 + struct.unpack('>H', data[2:4])[0]
            if len(data) >= expected:
                break
    return data


def _parse_status(response):
    """Turn a 'key=value&key=value&name&name' status string into a dict.

    Known keys are stored with their URL-unquoted value ('' when the chunk
    has no single '=' separated value); every other non-empty key is
    collected under 'playerlist'.
    """
    reserved_keys = ('ai', 'respawn', 'admins', 'players', 'host',
                     'version', 'mode', 'enter', 'vote', 'playerlist')
    parsed = {}
    for chunk in response.split('&'):
        dt = chunk.split('=')
        key = dt[0]
        if not key:
            continue  # empty chunk, e.g. from a trailing '&'
        if key in reserved_keys:
            parsed[key] = urllib.unquote(dt[1]) if len(dt) == 2 else ''
        else:
            if 'playerlist' not in parsed:
                parsed['playerlist'] = []
            parsed['playerlist'] += [key]
    return parsed


def ping_server(request):
    """Query the server once and dump its parsed status to STATS_FILE.

    Args:
        request: topic to send, e.g. b'?status'; a leading '?' is added
            when missing.

    Returns:
        True when a non-empty string response was received and written,
        False on timeout, connection error, or an unusable response
        (errors are logged).
    """
    try:
        s = open_socket()
        try:
            s.sendall(_build_query(request))
            data = _recv_packet(s)
        finally:
            # Close on every path so failed pings do not leak descriptors.
            s.close()
    except socket.timeout:
        log.error("Socket timed out!")
        return False
    except socket.error:
        log.error("Connection lost!")
        return False

    response = decode_packet(data)
    # decode_packet() hands back b'' for undecodable packets and a float for
    # numeric ones; only a non-empty string can be parsed as a status line.
    if not isinstance(response, str) or not response:
        log.error("Received NONE from server!")
        return False
    response = response.replace('\x00', '')

    # Example of the resulting dict:
    # {'ai': '1', 'respawn': '0', 'admins': '0', 'players': '0', 'host': '', 'version': '/vg/+Station+13', 'mode': 'secret', 'enter': '1', 'vote': '0'}
    parsed_response = _parse_status(response)
    with open(STATS_FILE, 'w') as f:
        json.dump(parsed_response, f)
    return True
# ---------------------------------------------------------------------------
# Logging setup.  Done once at module level: `log` is used as a module
# global elsewhere in this file, and attaching the handlers inside a
# per-message function would add a new pair of handlers for every datagram.
# ---------------------------------------------------------------------------
if not os.path.isdir(LOGPATH):
    os.makedirs(LOGPATH)

logFormatter = logging.Formatter(fmt='%(asctime)s [%(levelname)-8s]: %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
log = logging.getLogger()
log.setLevel(logging.INFO)

# NOTE(review): the logging documentation states that rollover never occurs
# when backupCount is 0, so the 50MB cap below has no effect as written.
fileHandler = logging.handlers.RotatingFileHandler(os.path.join(LOGPATH, 'crash.log'), maxBytes=1024*1024*50, backupCount=0)  # 50MB
fileHandler.setFormatter(logFormatter)
log.addHandler(fileHandler)

consoleHandler = logging.StreamHandler()
consoleHandler.setFormatter(logFormatter)
log.addHandler(consoleHandler)

# Most recent "ticker_state" message; None until one has been received, so
# readers never hit an undefined name.
last_ticker_state = None


def _append_line(path, text):
    """Append *text* plus a newline to *path*; log instead of raising on I/O failure."""
    try:
        with open(path, "a+") as fh:
            fh.write(text + "\n")
    except IOError as err:
        log.warning("Could not append to %s: %s", path, err)


def handle_message(data, addr):
    """Process one query-string encoded datagram.

    Recognised values of the "type" field:
      log           append "message" to the file named by "log"
      ticker_state  remember "message" as the last ticker state
      startup       record the last ticker state in crashlog.txt, if any

    Datagrams with a missing or unknown "type", or with missing fields,
    are ignored.

    Args:
        data: raw datagram payload.
        addr: sender address; not used.
    """
    global last_ticker_state
    params = urlparse.parse_qs(data)
    print(data)

    msg_type = params.get("type", [""])[0]
    message = str(params.get("message", [""])[0])

    if msg_type == "log":
        # NOTE(review): the target path is taken verbatim from the datagram.
        # The receiving socket is bound to 127.0.0.1, but any local process
        # can still make the watchdog append to a file of its choosing.
        target = str(params.get("log", [""])[0])
        if target and message:
            _append_line(target, message)
    elif msg_type == "ticker_state":
        if message:
            last_ticker_state = message
    elif msg_type == "startup":
        if last_ticker_state:
            _append_line("crashlog.txt", "Server exited, last ticker state was: " + last_ticker_state)
# ---- Main watchdog loop ----
# NOTE(review): this copy of the file has lost its indentation, so the nesting
# of the statements below cannot be read off the text.  After the single
# `while True:` there is both a UDP receive/restart sequence (sock.recvfrom ...
# "./start") and a status-ping sequence (ping_server ... RESTART_COMMAND);
# confirm which statements belong to which branch before re-indenting.
sock.settimeout(60*6) # 6 minute timeout (60*6 = 360 seconds)
log.info('/vg/station Watchdog: Started.')
# State used by the status-ping sequence:
lastState=True # outcome of the previous ping; set to False after a failed one
failChain=0 # failed pings counted since the last successful one
firstRun=True # set to False further down
while True:
try:
# Wait for one datagram and hand it to handle_message(); socket.timeout is
# raised when nothing arrives within the timeout set above.
data, addr = sock.recvfrom( 1024 ) # buffer size is 1024 bytes
handle_message(data,addr)
except socket.timeout:
# try to start the server again
print("Server timed out.. attempting restart.")
if last_ticker_state:
open("crashmsg.txt","a+").write("Server crashed, trying to reboot. last ticker state: "+last_ticker_state+"\n")
# NOTE(review): subprocess.call() is given a whole command line as one string
# without shell=True; per the subprocess documentation the string is then
# taken as the program name, so the arguments are not split - verify.
subprocess.call("killall -9 DreamDaemon")
subprocess.call("./start")
# Ask the server for its status; ping_server() returns False on failure.
if not ping_server(b'?status'):
# try to start the server again
failChain += 1
if lastState == False:
if failChain > MAX_FAILURES:
log.error('Too many failures, quitting.')
# NOTE(review): `sys` is not among the imports at the top of this file, so
# this call would raise NameError instead of exiting - confirm and add the import.
sys.exit(1)
log.error('Try {0}/{1}...'.format(failChain,MAX_FAILURES))
else:
# NOTE(review): the message below opens a parenthesis that is never closed.
log.error("Detected a problem, attempting restart ({0}/{1}.".format(failChain,MAX_FAILURES))
subprocess.call(RESTART_COMMAND,shell=True)
time.sleep(50) # Sleep 50 seconds for a total of one minute before we ping again.
lastState=False
else:
if lastState == False:
log.info('Server is confirmed to be back up and running.')
if firstRun:
log.info('Server is confirmed to be up and running.')
# Successful ping: reset the failure bookkeeping.
lastState=True
failChain=0
firstRun=False
time.sleep(10) # Ten seconds between "pings".