[Buildbot-commits] buildbot/buildbot/process base.py,1.43,1.44 builder.py,1.18,1.19 step.py,1.56,1.57

Brian Warner warner at users.sourceforge.net
Fri Dec 3 22:54:54 UTC 2004


Update of /cvsroot/buildbot/buildbot/buildbot/process
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/process

Modified Files:
	base.py builder.py step.py 
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.


Index: base.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/base.py,v
retrieving revision 1.43
retrieving revision 1.44
diff -u -d -r1.43 -r1.44
--- base.py	30 Sep 2004 07:15:35 -0000	1.43
+++ base.py	3 Dec 2004 22:54:51 -0000	1.44
@@ -5,7 +5,7 @@
 
 from twisted.python import log, components
 from twisted.python.failure import Failure
-from twisted.internet import reactor, defer
+from twisted.internet import reactor, defer, error
 from twisted.spread import pb
 import twisted.web.util
 
@@ -221,8 +221,10 @@
         first Step. It returns a Deferred which will fire when the build
         finishes."""
 
+        log.msg("%s.startBuild" % self)
         self.build_status = build_status
         self.remote = remote
+        self.remote.notifyOnDisconnect(self.lostRemote)
         self.deferred = defer.Deferred()
 
         try:
@@ -354,6 +356,8 @@
         self.results.append(result)
         if text:
             self.text.extend(text)
+        if not self.remote:
+            terminate = True
         if result == FAILURE:
             if step.warnOnFailure:
                 if self.result != FAILURE:
@@ -371,22 +375,38 @@
                 self.result = FAILURE
         return terminate
 
+    def lostRemote(self, remote):
+        # the slave went away. There are several possible reasons for this,
+        # and they aren't necessarily fatal. For now, kill the build, but
+        # TODO: see if we can resume the build when it reconnects.
+        log.msg("%s.lostRemote" % self)
+        self.remote = None
+        if self.currentStep:
+            # this should cause the step to finish.
+            log.msg(" stopping currentStep", self.currentStep)
+            self.currentStep.interrupt(Failure(error.ConnectionLost()))
+
     def stopBuild(self, reason):
-        # the idea here is to let the user cancel a build because, e.g., they
-        # realized they committed a bug and they don't want to waste the time
-        # building something that they know will fail. Another reason might
-        # be to abandon a stuck build. We want to mark the build as failed
-        # quickly rather than waiting for it to die on its own.
+        # the idea here is to let the user cancel a build because, e.g.,
+        # they realized they committed a bug and they don't want to waste
+        # the time building something that they know will fail. Another
+        # reason might be to abandon a stuck build. We want to mark the
+        # build as failed quickly rather than waiting for the slave's
+        # timeout to kill it on its own.
 
         log.msg(" %s: stopping build: %s" % (self, reason))
-        assert not self.finished
-        #self.currentStep.stop(reason)
-        # TODO: maybe let its deferred do buildFinished
-        if self.currentStep and self.currentStep.progress:
-            # XXX: really .fail or something
-            self.currentStep.progress.finish()
-        text = ["stopped", reason]
-        self.buildFinished(text, "red", FAILURE)
+        if self.finished:
+            return
+        # TODO: include 'reason' in this point event
+        self.builder.builder_status.addPointEvent(['interrupt'])
+        self.currentStep.interrupt(reason)
+        if 0:
+            # TODO: maybe let its deferred do buildFinished
+            if self.currentStep and self.currentStep.progress:
+                # XXX: really .fail or something
+                self.currentStep.progress.finish()
+            text = ["stopped", reason]
+            self.buildFinished(text, "red", FAILURE)
 
     def allStepsDone(self):
         if self.result == FAILURE:
@@ -419,6 +439,8 @@
         abandoned."""
 
         self.finished = True
+        if self.remote:
+            self.remote.dontNotifyOnDisconnect(self.lostRemote)
         self.results = results
 
         log.msg(" %s: build finished" % self)
@@ -439,5 +461,9 @@
 
 class BuildControl(components.Adapter):
     __implements__ = interfaces.IBuildControl,
+
     def getStatus(self):
         return self.original.build_status
+
+    def stopBuild(self, reason="<no reason given>"):
+        self.original.stopBuild(reason)

Index: builder.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/builder.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -d -r1.18 -r1.19
--- builder.py	7 Nov 2004 20:32:47 -0000	1.18
+++ builder.py	3 Dec 2004 22:54:51 -0000	1.19
@@ -1,8 +1,8 @@
 #! /usr/bin/python
 
-from twisted.python import log, components
+from twisted.python import log, components, failure
 from twisted.spread import pb
-from twisted.internet import reactor
+from twisted.internet import reactor, defer
 
 from buildbot import interfaces
 from buildbot.status.progress import Expectations
@@ -172,6 +172,7 @@
 
     def detached(self):
         """This is called when the connection to the bot is lost."""
+        log.msg("%s.detached" % self)
         self.remote = None
         reactor.callLater(0, self._detached)
         # the current step will be stopped (via a notifyOnDisconnect
@@ -179,6 +180,7 @@
 
     def _detached(self):
         if self.currentBuild:
+            log.msg("%s._detached: killing build" % self)
             # wasn't enough
             self.currentBuild.stopBuild("slave lost")
             self.currentBuild = None
@@ -298,7 +300,8 @@
 
     def startBuild(self, build):
         log.msg("starting build %s" % build)
-        self.remote.callRemote("startBuild") # informational courtesy
+        d = self.remote.callRemote("startBuild") # informational courtesy
+        d.addErrback(self._startBuildFailed, build)
 
         # create the BuildStatus object that goes with the Build
         bs = self.builder_status.newBuild()
@@ -309,9 +312,14 @@
         # Finally it will start the actual build process.
         d = build.startBuild(bs, self.expectations, self.remote)
         d.addCallback(self.buildFinished)
+        d.addErrback(log.err)
         control = base.BuildControl(build)
         return control
 
+    def _startBuildFailed(self, why, build):
+        log.msg("wanted to start build %s, but "
+                "remote_startBuild failed: %s" % (build, why))
+
     def testsFinished(self, results):
         # XXX: add build number, datestamp, Change information
         #self.testTracker.testsFinished(results)
@@ -365,6 +373,50 @@
             self.remote.callRemote("shutdown")
             
 
+class Ping:
+    def ping(self, status, remote, timeout):
+        if not remote:
+            status.addPointEvent(["ping", "no slave"], "red")
+            return defer.succeed(False) # interfaces.NoSlaveError
+        self.event = status.addEvent(["pinging"], "yellow")
+        self.active = True
+        self.d = defer.Deferred()
+        d = remote.callRemote("print", "ping")
+        d.addBoth(self._pong)
+
+        # We use either our own timeout or the (long) TCP timeout to detect
+        # silently-missing slaves. This might happen because of a NAT
+        # timeout or a routing loop. If the slave just shuts down (and we
+        # somehow missed the FIN), we should get a "connection refused"
+        # message.
+        self.timer = reactor.callLater(timeout, self.timeout)
+        return self.d
+
+    def timeout(self):
+        self.timer = None
+        self._pong(failure.Failure(interfaces.NoSlaveError("timeout")))
+
+    def _pong(self, res):
+        if not self.active:
+            return
+        self.active = False
+        if self.timer:
+            self.timer.cancel()
+        e = self.event
+        if isinstance(res, failure.Failure):
+            e.text = ["ping", "failed"]
+            e.color = "red"
+            ponged = False
+            # TODO: force the BotPerspective to disconnect, since this
+            # indicates that the bot is unreachable. That will also append a
+            # "disconnect" event to the builder_status, terminating this
+            # "ping failed" event.
+        else:
+            e.text = ["ping", "success"]
+            e.color = "green"
+            ponged = True
+        e.finish()
+        self.d.callback(ponged)
 
 class BuilderControl(components.Adapter):
     __implements__ = interfaces.IBuilderControl,
@@ -372,39 +424,15 @@
         bc = self.original.forceBuild(who, reason)
         return bc
 
-    def ping(self, wait=False):
-        status = self.original.builder_status
-        if not self.original.remote:
-            status.addPointEvent(["ping", "no slave"], "red")
-            if wait:
-                return defer.fail(interfaces.NoSlaveError())
-        else:
-            # we rely upon the TCP timeout to detect silently-missing
-            # slaves. This might happen because of a NAT timeout or a
-            # routing loop. If the slave just shuts down, we should get a
-            # "connection refused" message. Of course, in that case we
-            # should have gotten one for the connection anyway, but
-            # sometimes things get lost.
-            e = status.addEvent(["pinging"], "yellow")
-            d = self.original.remote.callRemote("print", "ping")
-            d.addCallback(self._pong, e)
-            d.addErrback(self._pong_failed, e, wait)
-            if wait:
-                return d
-
-    def _pong(self, res, e):
-        e.text = ["ping", "success"]
-        e.color = "green"
-        e.finish()
+    def getBuild(self, number):
+        b =  self.original.currentBuild
+        if b and b.build_status.number == number:
+            return base.BuildControl(b)
+        return None
 
-    def _pong_failed(self, why, e, wait):
-        e.text = ["ping", "failed"]
-        e.color = "red"
-        e.finish()
-        # TODO: force the BotPerspective to disconnect, since this indicates
-        # that the bot is unreachable. That will also append a "disconnect"
-        # event to the builder_status, terminating this "ping failed" event.
-        if wait:
-            raise interfaces.NoSlaveError()
+    def ping(self, timeout=30):
+        d = Ping().ping(self.original.builder_status,
+                        self.original.remote, timeout)
+        return d
 
 components.registerAdapter(BuilderControl, Builder, interfaces.IBuilderControl)

Index: step.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/step.py,v
retrieving revision 1.56
retrieving revision 1.57
diff -u -d -r1.56 -r1.57
--- step.py	28 Oct 2004 07:25:30 -0000	1.56
+++ step.py	3 Dec 2004 22:54:51 -0000	1.57
@@ -27,8 +27,7 @@
     SlaveCommands registered in the buildslave, and self.args to a dictionary
     of arguments that will be passed to the SlaveCommand instance.
 
-    start, remoteUpdate, remoteComplete, and remoteFailed are available to be
-    overridden
+    start, remoteUpdate, and remoteComplete are available to be overridden
 
     """
 
@@ -41,6 +40,14 @@
         self.remote_command = remote_command
         self.args = args
 
+    def __getstate__(self):
+        dict = self.__dict__.copy()
+        # Remove the remote ref: if necessary (only for resumed builds), it
+        # will be reattached at resume time
+        if dict.has_key("remote"):
+            del dict["remote"]
+        return dict
+
     def run(self, step, remote):
         self.active = True
         self.step = step
@@ -51,40 +58,40 @@
         self.commandID = "%d" % c
         log.msg("%s: RemoteCommand.run [%s]" % (self, self.commandID))
         self.deferred = defer.Deferred()
+
         d = defer.maybeDeferred(self.start)
-        d.addErrback(self._remoteFailed) # will catch unknown commands
-        return self.deferred
 
-    def __getstate__(self):
-        dict = self.__dict__.copy()
-        # Remove the remote ref: if necessary (only for resumed builds), it
-        # will be reattached at resume time
-        if dict.has_key("remote"):
-            del dict["remote"]
-        return dict
+        # _finished is called with an error for unknown commands, errors
+        # that occur while the command is starting (including OSErrors in
+        # exec()), StaleBroker (when the connection was lost before we
+        # started), and pb.PBConnectionLost (when the slave isn't responding
+        # over this connection, perhaps it had a power failure, or NAT
+        # weirdness). If this happens, self.deferred is fired right away.
+        d.addErrback(self._finished)
+
+        # Connections which are lost while the command is running are caught
+        # when our parent Step calls our .lostRemote() method.
+        return self.deferred
 
     def start(self):
         # We will receive remote_update messages as the command runs.
         # We will get a single remote_complete when it finishes.
         # We should fire self.deferred when the command is done.
-        self.remote.notifyOnDisconnect(self.disconnect)
         d = self.remote.callRemote("startCommand", self, self.commandID,
                                    self.remote_command, self.args)
         return d
 
-    def disconnect(self, broker):
-        # lost the slave: fail the command
-        log.msg("RemoteCommand.disconnect: lost slave", self)
-        self.active = False
-        self._remoteFailed(Failure(error.ConnectionLost()))
+    def interrupt(self, why):
+        if isinstance(why, Failure) and why.check(error.ConnectionLost):
+            log.msg("RemoteCommand.disconnect: lost slave", self)
+            self.remote = None
+            self._finished(Failure(error.ConnectionLost()))
+            return
 
-    def todo_stop(self):
         # tell the remote command to halt. Returns a Deferred that will fire
-        # when the command has been stopped, or will errback if the slave is
-        # unreachable.
+        # when the interrupt command has been delivered.
         d = defer.maybeDeferred(self.remote.callRemote,
-                                "stopCommand", self, self.commandID,
-                                reason)
+                                "interruptCommand", self.commandID, why)
         return d
 
     def remote_update(self, updates):
@@ -96,7 +103,7 @@
                     self.remoteUpdate(update)
             except:
                 # log failure, terminate build, let slave retire the update
-                self.failed(Failure())
+                self._finished(Failure())
                 # TODO: what if multiple updates arrive? should
                 # skip the rest but ack them all
             if num > max_updatenum:
@@ -109,51 +116,45 @@
     def remote_complete(self, failure=None):
         # call the real remoteComplete a moment later, but first return an
         # acknowledgement so the slave can retire the completion message.
-        self.remote.dontNotifyOnDisconnect(self.disconnect)
         if self.active:
-            reactor.callLater(0, self._remoteComplete, failure)
+            reactor.callLater(0, self._finished, failure)
         return None
 
-    def _remoteComplete(self, failure):
-        if failure:
-            return self._remoteFailed(failure)
-        try:
-            self.remoteComplete()
-        except:
-            # log the failure and terminate the step
-            log.msg("remoteComplete had exception")
-            return self.failed(Failure())
-        self.finished()
-
-    def _remoteFailed(self, failure):
-        log.msg("RemoteCommand._remoteFailed")
-        try:
-            self.remote.dontNotifyOnDisconnect(self.disconnect)
-        except ValueError:
-            # TODO: make this cleaner but keep it safe
-            pass # probably already removed it in remote_complete.
-        try:
-            self.remoteFailed(failure)
-        except:
-            log.msg("RemoteCommand.remoteFailed failed")
-            log.err()
-        return self.failed(failure)
+    def _finished(self, failure=None):
+        self.active = False
+        # call .remoteComplete. If it raises an exception, or returns the
+        # Failure that we gave it, our self.deferred will be errbacked. If
+        # it does not (either it ate the Failure or there the step finished
+        # normally and it didn't raise a new exception), self.deferred will
+        # be callbacked.
+        d = defer.maybeDeferred(self.remoteComplete, failure)
+        # arrange for the callback to get this RemoteCommand instance
+        # instead of just None
+        d.addCallback(lambda r: self)
+        d.addBoth(self.deferred.callback)
 
-    def remoteComplete(self):
-        # subclasses should interpret status as they like and do cleanup
-        pass
+    def remoteComplete(self, maybeFailure):
+        """Subclasses can override this.
 
-    def remoteFailed(self, why):
-        # subclasses should do any cleanup (like closing log files) here
-        pass
+        This is called when the RemoteCommand has finished. 'maybeFailure'
+        will be None if the command completed normally, or a Failure
+        instance in one of the following situations:
 
-    def finished(self):
-        self.active = False
-        self.deferred.callback(self)
+        #  the slave was lost before the command was started
+        #  the slave didn't respond to the startCommand message
+        #  the slave raised an exception while starting the command
+        #   (bad command name, bad args, OSError from missing executable)
+        #  the slave raised an exception while finishing the command
+        #   (they send back a remote_complete message with a Failure payload)
+        # and also (for now):
+        #  slave disconnected while the command was running
+        
+        This method should do cleanup, like closing log files. It should
+        normally return the 'failure' argument, so that any exceptions will
+        be propagated to the Step. If it wants to consume them, return None
+        instead."""
 
-    def failed(self, why):
-        self.active = False
-        self.deferred.errback(why)
+        return failure
 
 class LoggedRemoteCommand(RemoteCommand):
     """This is a RemoteCommand which expects the slave to send back
@@ -203,15 +204,14 @@
             log.msg("%s rc=%s" % (self, rc))
             self.addHeader("program finished with exit code %d\n" % rc)
 
-    def remoteComplete(self):
-        if self.closeWhenFinished:
-            log.msg("closing log")
-            self.log.finish()
-
-    def remoteFailed(self, why):
+    def remoteComplete(self, maybeFailure):
         if self.closeWhenFinished:
-            self.addHeader("\nremoteFailed: %s" % why)
+            if maybeFailure:
+                self.addHeader("\nremoteFailed: %s" % maybeFailure)
+            else:
+                log.msg("closing log")
             self.log.finish()
+        return maybeFailure
 
 class RemoteShellCommand(LoggedRemoteCommand):
     """This class helps you run a shell command on the build slave. It will
@@ -426,6 +426,15 @@
         
         raise NotImplementedError("your subclass must implement this method")
 
+    def interrupt(self, reason):
+        """Halt the command, either because the user has decided to cancel
+        the build ('reason' is a string), or because the slave has
+        disconnected ('reason' is a ConnectionLost Failure). Any further
+        local processing should be skipped, and the Step completed with an
+        error status. The results text should say something useful like
+        ['step', 'interrupted'] or ['remote', 'lost']"""
+        pass
+
     def finished(self, results):
         if self.progress:
             self.progress.finish()
@@ -563,9 +572,24 @@
         self.cmd.useLog(loog, True)
         loog.logProgressTo(self.progress, "output")
         d = self.runCommand(self.cmd)
-        d.addCallback(self._commandComplete)
+        d.addCallbacks(self._commandComplete, self.checkDisconnect)
         d.addErrback(self.failed)
 
+    def interrupt(self, reason):
+        # TODO: consider adding an INTERRUPTED or STOPPED status to use
+        # instead of FAILURE, might make the text a bit more clear
+        self.addCompleteLog('interrupt', reason)
+        d = self.cmd.interrupt(reason)
+        return d
+
+    def checkDisconnect(self, f):
+        f.trap(error.ConnectionLost)
+        self.step_status.setColor("red")
+        self.step_status.setText(self.describe(True) +
+                                 ["failed", "slave", "lost"])
+        self.step_status.setText2(["failed", "slave", "lost"])
+        return self.finished(FAILURE)
+
     def _commandComplete(self, cmd):
         self.commandComplete(cmd)
         self.createSummary(cmd.log)
@@ -1106,33 +1130,44 @@
     @param timeout: the number of seconds to delay
     """
 
+    haltOnFailure = True
     name = "dummy"
 
     def __init__(self, timeout=5, **kwargs):
         BuildStep.__init__(self, **kwargs)
         self.timeout = timeout
+        self.timer = None
+
     def start(self):
         self.step_status.setColor("yellow")
         self.step_status.setText(["delay", "%s secs" % self.timeout])
-        reactor.callLater(self.timeout, self._done)
-    def _done(self):
+        self.timer = reactor.callLater(self.timeout, self.done)
+
+    def interrupt(self, reason):
+        if self.timer:
+            self.timer.cancel()
+            self.timer = None
+            self.step_status.setColor("red")
+            self.step_status.setText(["delay", "interrupted"])
+            self.finished(FAILURE)
+
+    def done(self):
         self.step_status.setColor("green")
         self.finished(SUCCESS)
 
-class FailingDummy(BuildStep):
+class FailingDummy(Dummy):
     """I am a dummy step that raises an Exception after 5 seconds
     @param timeout: the number of seconds to delay
     """
 
     name = "failing dummy"
-    def __init__(self, timeout=5, **kwargs):
-        BuildStep.__init__(self, **kwargs)
-        self.timeout = timeout
+
     def start(self):
         self.step_status.setColor("yellow")
         self.step_status.setText(["boom", "%s secs" % self.timeout])
-        reactor.callLater(self.timeout, self.boom)
-    def boom(self):
+        self.timer = reactor.callLater(self.timeout, self.done)
+
+    def done(self):
         class Boom(Exception):
             pass
         try:
@@ -1150,6 +1185,7 @@
     """
 
     name = "remote dummy"
+
     def __init__(self, timeout=5, **kwargs):
         BuildStep.__init__(self, **kwargs)
         args = {'timeout': timeout}





More information about the Commits mailing list