[Buildbot-commits] buildbot/buildbot/slave bot.py,1.5,1.6

Brian Warner warner at users.sourceforge.net
Sat Dec 11 11:12:37 UTC 2004


Update of /cvsroot/buildbot/buildbot/buildbot/slave
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3363/buildbot/slave

Modified Files:
	bot.py 
Log Message:
* buildbot/slave/bot.py (BotFactory): revamp keepalive/lost-master detection
code. Require some sign of life from the buildmaster every
BotFactory.keepaliveInterval seconds. Provoke this indication at
BotFactory.keepaliveTimeout seconds before the deadline by sending a
keepalive request. We don't actually care if that request is answered in a
timely fashion; what we care about is that .activity() is called before the
deadline. .activity() is triggered by any PB message from the master
(including an ack to one of the slave's status-update messages). With this
new scheme, large status messages over slow pipes are OK, as long as any
given message can be sent (and thus acked) within .keepaliveTimeout seconds
(which defaults to 30).
(SlaveBuilder.remote_startCommand): record activity
(SlaveBuilder.ackUpdate): same
(SlaveBuilder.ackComplete): same
(BotFactory.gotPerspective): same
* buildbot/test/test_run.py (Disconnect.testSlaveTimeout): test it


Index: bot.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/bot.py,v
retrieving revision 1.5
retrieving revision 1.6
diff -u -d -r1.5 -r1.6
--- bot.py	4 Dec 2004 21:02:03 -0000	1.5
+++ bot.py	11 Dec 2004 11:12:35 -0000	1.6
@@ -8,6 +8,7 @@
 from twisted.application import service, internet
 from twisted.cred import credentials
 
+from buildbot.util import now
 from buildbot.pbutil import ReconnectingPBClientFactory
 from buildbot.slave import registry
 # make sure the standard commands get registered
@@ -81,6 +82,12 @@
         if self.stopCommandOnShutdown:
             self.stopCommand()
 
+    def activity(self):
+        #bot = self.parent
+        #buildslave = bot.parent
+        #bf = buildslave.bf
+        self.parent.parent.bf.activity()
+
     def remote_setMaster(self, remote):
         self.remote = remote
         self.remote.notifyOnDisconnect(self.lostRemote)
@@ -110,6 +117,8 @@
         """This is called multiple times by various master-side BuildSteps,
         to start various commands that actually do the build."""
 
+        self.activity()
+
         if self.command:
             log.msg("leftover command, dropping it")
             self.stopCommand()
@@ -132,6 +141,7 @@
     def remote_interruptCommand(self, stepId, why):
         """Halt the current step."""
         log.msg("asked to interrupt current command: %s" % why)
+        self.activity()
         if not self.command:
             # TODO: just log it, a race could result in their interrupting a
             # command that wasn't actually running
@@ -175,12 +185,10 @@
             d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate")
 
     def ackUpdate(self, acknum):
-        # TODO: update the "last activity" timer
-        pass
+        self.activity() # update the "last activity" timer
 
     def ackComplete(self, dummy):
-        # TODO: update the "last activity" timer
-        pass
+        self.activity() # update the "last activity" timer
 
     def _ackFailed(self, why, where):
         log.msg("SlaveBuilder._ackFailed:", where)
@@ -283,13 +291,32 @@
         d.addCallbacks(log.msg, log.err)
 
 class BotFactory(ReconnectingPBClientFactory):
-    keepaliveTimeout = 30
+    # 'keepaliveInterval' serves two purposes. The first is to keep the
+    # connection alive: it guarantees that there will be at least some
+    # traffic once every 'keepaliveInterval' seconds, which may help keep an
+    # interposed NAT gateway from dropping the address mapping because it
+    # thinks the connection has been abandoned. The second is to put an upper
+    # limit on how long the buildmaster might have gone away before we notice
+    # it. For this second purpose, we insist upon seeing *some* evidence of
+    # the buildmaster at least once every 'keepaliveInterval' seconds.
+    keepaliveInterval = None # None = do not use keepalives
+
+    # 'keepaliveTimeout' seconds before the interval expires, we will send a
+    # keepalive request, both to add some traffic to the connection, and to
+    # prompt a response from the master in case all our builders are idle. We
+    # don't insist upon receiving a timely response from this message: a slow
+    # link might put the request at the wrong end of a large build message.
+    keepaliveTimeout = 30 # how long we will go without a response
+
+    keepaliveTimer = None
+    activityTimer = None
+    lastActivity = 0
     unsafeTracebacks = 1
 
-    def __init__(self, keepaliveInterval=0):
+    def __init__(self, keepaliveInterval, keepaliveTimeout):
         ReconnectingPBClientFactory.__init__(self)
-        self.keepaliveTimer = None
         self.keepaliveInterval = keepaliveInterval
+        self.keepaliveTimeout = keepaliveTimeout
 
     def startedConnecting(self, connector):
         ReconnectingPBClientFactory.startedConnecting(self, connector)
@@ -304,10 +331,11 @@
             log.msg("unable to set SO_KEEPALIVE")
             if not self.keepaliveInterval:
                 self.keepaliveInterval = 10*60
+        self.activity()
         if self.keepaliveInterval:
             log.msg("sending application-level keepalives every %d seconds" \
                     % self.keepaliveInterval)
-            self.startKeepaliveTimer()
+            self.startTimers()
 
     def clientConnectionFailed(self, connector, reason):
         self.connector = None
@@ -316,55 +344,74 @@
 
     def clientConnectionLost(self, connector, reason):
         self.connector = None
+        self.stopTimers()
+        self.perspective = None
         ReconnectingPBClientFactory.clientConnectionLost(self,
                                                          connector, reason)
+
+    def startTimers(self):
+        assert self.keepaliveInterval
+        assert not self.keepaliveTimer
+        assert not self.activityTimer
+        # Insist that doKeepalive fires before checkActivity. Really, it
+        # needs to happen at least one RTT beforehand.
+        assert self.keepaliveInterval > self.keepaliveTimeout
+
+        # arrange to send a keepalive a little while before our deadline
+        when = self.keepaliveInterval - self.keepaliveTimeout
+        self.keepaliveTimer = reactor.callLater(when, self.doKeepalive)
+        # and check for activity too
+        self.activityTimer = reactor.callLater(self.keepaliveInterval,
+                                               self.checkActivity)
+
+    def stopTimers(self):
         if self.keepaliveTimer:
             self.keepaliveTimer.cancel()
             self.keepaliveTimer = None
-        self.perspective = None
+        if self.activityTimer:
+            self.activityTimer.cancel()
+            self.activityTimer = None
 
-    def startKeepaliveTimer(self):
-        """Once this is called, the bot will send an application-level
-        keepalive query every self.keepaliveInterval seconds. If the master
-        has not responded within self.keepaliveTimeout seconds, the link
-        is deemed dead and we will initiate a reconnect.
-        """
-        self.keepaliveTimer = reactor.callLater(self.keepaliveInterval,
-                                                self.doKeepalive)
+    def activity(self, res=None):
+        self.lastActivity = now()
 
     def doKeepalive(self):
+        # send the keepalive request. If it fails outright, the connection
+        # was already dropped, so just log and ignore.
         self.keepaliveTimer = None
         log.msg("sending app-level keepalive")
         d = self.perspective.callRemote("keepalive")
-        d.setTimeout(self.keepaliveTimeout)
-        # TODO: when the timeout is invoked, the PBConnectionLost causes an
-        # AlreadyCalledError
+        d.addCallback(self.activity)
         d.addErrback(self.keepaliveLost)
-        self.startKeepaliveTimer()
 
-    def keepaliveLost(self, why):
-        # either the network connection was lost (why == StaleReference or
-        # something), or the network has silently gone away (perhaps a NAT
-        # box has forgotten about us) (why == TimeoutError).
-        log.msg("keepaliveLost", why)
-        self.perspective.broker.transport.loseConnection()
+    def keepaliveLost(self, f):
+        log.msg("BotFactory.keepaliveLost")
+
+    def checkActivity(self):
+        self.activityTimer = None
+        if self.lastActivity + self.keepaliveInterval < now():
+            log.msg("BotFactory.checkActivity: nothing from master for "
+                    "%d secs" % (now() - self.lastActivity))
+            self.perspective.broker.transport.loseConnection()
+            return
+        self.startTimers()
 
     def stopFactory(self):
         ReconnectingPBClientFactory.stopFactory(self)
-        if self.keepaliveTimer:
-            self.keepaliveTimer.cancel()
-            self.keepaliveTimer = None
+        self.stopTimers()
 
 
 class BuildSlave(service.MultiService):
     botClass = Bot
 
     def __init__(self, host, port, name, passwd, basedir, keepalive,
-                 usePTY):
+                 usePTY, keepaliveTimeout=30):
         service.MultiService.__init__(self)
         bot = self.botClass(basedir, usePTY)
         bot.setServiceParent(self)
-        bf = self.bf = BotFactory(keepalive)
+        if keepalive == 0:
+            keepalive = None
+        bf = self.bf = BotFactory(keepalive, keepaliveTimeout)
         bf.startLogin(credentials.UsernamePassword(name, passwd), client=bot)
         self.connection = c = internet.TCPClient(host, port, bf)
         c.setServiceParent(self)





More information about the Commits mailing list