[Buildbot-commits] buildbot/buildbot/slave interfaces.py,NONE,1.1 bot.py,1.3,1.4 commands.py,1.16,1.17

Brian Warner warner at users.sourceforge.net
Fri Dec 3 22:54:54 UTC 2004


Update of /cvsroot/buildbot/buildbot/buildbot/slave
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/slave

Modified Files:
	bot.py commands.py 
Added Files:
	interfaces.py 
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.


Index: bot.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/bot.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- bot.py	30 Aug 2004 22:15:23 -0000	1.3
+++ bot.py	3 Dec 2004 22:54:52 -0000	1.4
@@ -35,21 +35,20 @@
     def __init__(self, builder):
         self.builder = builder
     
-class SlaveBuilder(pb.Referenceable):
+class SlaveBuilder(pb.Referenceable, service.Service):
 
     """This is the local representation of a single Builder: it handles a
     single kind of build (like an all-warnings build). It has a name and a
     home directory. The rest of its behavior is determined by the master.
     """
-    
+
+    stopCommandOnShutdown = True
+
     def __init__(self, parent, name, builddir, not_really):
+        #service.Service.__init__(self)
         self.name = name
-        self.bot = parent
         self.builddir = builddir
         self.not_really = not_really
-        self.basedir = os.path.join(self.bot.basedir, builddir)
-        if not os.path.isdir(self.basedir):
-            os.mkdir(self.basedir)
         self.remote = None
         # remote is a ref to the Builder object on the master side, and is
         # set when they attach. It really isn't used very much
@@ -74,6 +73,20 @@
         
     def __repr__(self):
         return "<SlaveBuilder '%s'>" % self.name
+
+    def setServiceParent(self, parent):
+        service.Service.setServiceParent(self, parent)
+        self.bot = self.parent
+        self.basedir = os.path.join(self.bot.basedir, self.builddir)
+        if not os.path.isdir(self.basedir):
+            os.mkdir(self.basedir)
+
+    def stopService(self):
+        service.Service.stopService(self)
+        if self.command and self.stopCommandOnShutdown:
+            self.stopCommand()
+            self.command = None
+
     def remote_setMaster(self, remote):
         self.remote = remote
         self.remote.notifyOnDisconnect(self.lostRemote)
@@ -82,14 +95,13 @@
 
     def lostRemote(self, remote):
         log.msg("lost remote")
-        if self.remote != remote:
-            print "WEIRD: lost the wrong remote"
         self.remote = None
+
     def lostRemoteStep(self, remotestep):
         log.msg("lost remote step")
-        if self.remoteStep != remotestep:
-            print "WEIRD: lost the wrong remote step"
         self.remoteStep = None
+        if self.command and self.stopCommandOnShutdown:
+            self.stopCommand()
         
     # the following are Commands that can be invoked by the master-side
     # Builder
@@ -105,7 +117,7 @@
 
         if self.command:
             log.msg("leftover command, dropping it")
-            #self.stopCommand()
+            self.stopCommand()
             self.command = None
 
         try:
@@ -120,25 +132,49 @@
         self.remoteStep.notifyOnDisconnect(self.lostRemoteStep)
         self.updateNum = 0
         self.complete = None
+        self.command.running = True
         d = defer.maybeDeferred(self.command.start)
         d.addCallbacks(self.commandComplete, self.commandFailed)
         return None
 
-    # the following are invoked by the Commands we spawn
+    def remote_interruptCommand(self, stepId, why):
+        """Halt the current step."""
+        log.msg("asked to interrupt current command: %s" % why)
+        if not self.command:
+            # TODO: just log it, a race could result in their interrupting a
+            # command that wasn't actually running
+            log.msg(" .. but none was running")
+            return
+        self.command.interrupt()
+
+    def stopCommand(self):
+        if not self.command:
+            return
+        self.command.running = False
+        if not self.command.interrupted:
+            self.command.interrupt()
+        
+
+    # these two are fired by the Deferred attached to each Command
     def commandComplete(self, dummy):
+        if not self.running:
+            return
         self.sendComplete()
     def commandFailed(self, why):
+        if not self.running:
+            return
         log.msg("commandFailed")
         log.err(why)
         self.sendComplete(why)
 
-
-    # these are utility routines used by sendStatus and commandComplete
+    # sendUpdate is invoked by the Commands we spawn
     def sendUpdate(self, data=None):
         """This sends the status update to the master-side BuildStep object,
         giving it a sequence number in the process. It adds the update to
         a queue, and asks the master to acknowledge the update so it can be
         removed from that queue."""
+        if not self.running:
+            return
         self.updateNum += 1
         update = [data, self.updateNum]
         #log.msg("sendUpdate", update)
@@ -149,6 +185,8 @@
             d.addCallback(self.ackUpdate)
             d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate")
 
+    # these are utility routines used by sendStatus and commandComplete
+
     def dummy(self, value):
         pass
 
@@ -218,10 +256,6 @@
         self.sendAllUpdates()
         self.sendAllCompletes()
 
-    def stopCommand(self):
-        if self.command:
-            self.command.interrupt()
-        self.command = None
     def finishCommand(self):
         log.msg("SlaveBuilder.finishCommand", self.command)
         self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep)
@@ -233,10 +267,12 @@
         reactor.stop()
         
         
-class Bot(pb.Referenceable):
+class Bot(pb.Referenceable, service.MultiService):
     usePTY = None
+    name = "bot"
 
     def __init__(self, basedir, usePTY, not_really=0):
+        service.MultiService.__init__(self)
         self.basedir = basedir
         self.usePTY = usePTY
         self.not_really = not_really
@@ -263,11 +299,13 @@
             else:
                 b = SlaveBuilder(self, name, builddir, self.not_really)
                 b.usePTY = self.usePTY
+                b.setServiceParent(self)
                 self.builders[name] = b
             retval[name] = b
         for name in self.builders.keys():
             if not name in map(lambda a: a[0], wanted):
                 log.msg("removing old builder %s" % name)
+                self.builder[name].disownServiceParent()
                 del(self.builders[name])
         return retval
     
@@ -370,18 +408,33 @@
             self.keepaliveTimer = None
 
 
-class BuildSlave(internet.TCPClient):
+class BuildSlave(service.MultiService):
+    botClass = Bot
+
     def __init__(self, host, port, name, passwd, basedir, keepalive,
                  usePTY):
-        bot = Bot(basedir, usePTY)
+        service.MultiService.__init__(self)
+        bot = self.botClass(basedir, usePTY)
+        bot.setServiceParent(self)
         bf = self.bf = BotFactory(keepalive)
         bf.startLogin(credentials.UsernamePassword(name, passwd), client=bot)
-        internet.TCPClient.__init__(self, host, port, bf)
+        self.connection = c = internet.TCPClient(host, port, bf)
+        c.setServiceParent(self)
+
+    def waitUntilDisconnected(self):
+        # utility method for testing. Returns a Deferred that will fire when
+        # we lose the connection to the master.
+        if not self.bf.perspective:
+            return defer.succeed(None)
+        d = defer.Deferred()
+        self.bf.perspective.notifyOnDisconnect(lambda res: d.callback(None))
+        return d
 
     def stopService(self):
         self.bf.continueTrying = 0
-        internet.TCPClient.stopService(self)
-        return self._connection.disconnect()
+        service.MultiService.stopService(self)
+        # now kill the TCP connection
+        self.connection._connection.disconnect()
 
 class Options(usage.Options):
     synopsis = "Usage: mktap buildbot slave --name <name> --passwd <passwd> [options]"

Index: commands.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/commands.py,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -d -r1.16 -r1.17
--- commands.py	28 Oct 2004 07:27:08 -0000	1.16
+++ commands.py	3 Dec 2004 22:54:52 -0000	1.17
@@ -6,10 +6,14 @@
 from twisted.internet import reactor, defer
 from twisted.python import log, failure, runtime
 
+from buildbot.slave.interfaces import ISlaveCommand
 from buildbot.slave.registry import registerSlaveCommand
 
 cvs_ver = '$Revision$'[1+len("Revision: "):-2]
 
+# version history:
+#  >=1.17: commands are interruptable
+
 class CommandInterrupted(Exception):
     pass
 class TimeoutError(Exception):
@@ -206,11 +210,13 @@
 
     def doTimeout(self):
         msg = "command timed out: %d seconds without output" % self.timeout
+        self.kill(msg)
+
+    def kill(self, msg):
         msg += ", killing pid %d" % self.process.pid
         log.msg(msg)
         self.sendStatus({'header': "\n" + msg + "\n"})
 
-        # TODO: nicer way is: SIGTERM, wait, SIGKILL, wait, freak
         hit = 0
         if runtime.platformType == "posix":
             try:
@@ -226,14 +232,15 @@
                 # probably no-such-process, maybe because there is no process
                 # group
                 pass
-        try:
-            log.msg("trying process.signalProcess('KILL')")
-            self.process.signalProcess('KILL')
-            log.msg(" successful")
-            hit = 1
-        except OSError:
-            # could be no-such-process, because they finished very recently
-            pass
+        if not hit:
+            try:
+                log.msg("trying process.signalProcess('KILL')")
+                self.process.signalProcess('KILL')
+                log.msg(" successful")
+                hit = 1
+            except OSError:
+                # could be no-such-process, because they finished very recently
+                pass
         if not hit:
             log.msg("signalProcess/os.kill failed both times")
         # finished ought to be called momentarily
@@ -248,21 +255,17 @@
         # be raised as pp tries to send status through .command
         self.commandFailed(TimeoutError("SIGKILL failed to kill process"))
 
-    def interrupt(self):
-        log.msg("interrupting process", self.process)
-        self.process.signalProcess('KILL')
-        # some stdout/stderr may be lost, along with the exit code
-
 
 class Command:
+    __implements__ = ISlaveCommand,
 
-    """This class defines one command that can be invoked by the build
-    master. The command is executed on the slave side, and always sends back
-    a completion message when it finishes. It may also send intermediate
-    status as it runs (by calling builder.sendStatus). Some commands can be
-    interrupted (either by the build master or a local SIGINT), in which
-    case the status message indicates the step failed to complete because of
-    an interruption.
+    """This class defines one command that can be invoked by the build master.
+    The command is executed on the slave side, and always sends back a
+    completion message when it finishes. It may also send intermediate status
+    as it runs (by calling builder.sendStatus). Some commands can be
+    interrupted (either by the build master or a local timeout), in which
+    case the step is expected to complete normally with a status message that
+    indicates an error occurred.
 
     These commands are used by BuildSteps on the master side. Each kind of
     BuildStep uses a single Command. The slave must implement all the
@@ -274,46 +277,75 @@
     where 'builder' is the parent SlaveBuilder object, and 'args' is a
     dict that is interpreted per-command.
 
+    The setup(args) method is available for setup, and is run from __init__.
+
     The Command is started with start(). This method must be implemented in a
     subclass, and it should return a Deferred. When your step is done, you
     should fire the Deferred (the results are not used). If the command is
-    interrupted, it should errback with a CommandInterrupted failure.
+    interrupted, it should fire the Deferred anyway.
 
-    The status messages all carry a dict, which is interpreted by the
-    master-side BuildStep however it likes. The completion message only
-    specifies whether the command was interrupted or not. If the Command
-    needs to return an exit code of some sort, that should be sent as a
-    regular status message before the completion message is sent. Once
-    builder.commandComplete has been run, no more status messages may be
-    sent."""
+    While the command runs. it may send status messages back to the
+    buildmaster by calling self.sendStatus(statusdict). The statusdict is
+    interpreted by the master-side BuildStep however it likes.
+
+    A separate completion message is sent when the deferred fires, which
+    indicates that the Command has finished, but does not carry any status
+    data. If the Command needs to return an exit code of some sort, that
+    should be sent as a regular status message before the deferred is fired .
+    Once builder.commandComplete has been run, no more status messages may be
+    sent.
+
+    If interrupt() is called, the Command should attempt to shut down as
+    quickly as possible. Child processes should be killed, new ones should
+    not be started. The Command should send some kind of error status update,
+    then complete as usual by firing the Deferred.
+
+    .interrupted should be set by interrupt(), and can be tested to avoid
+    sending multiple error status messages.
+
+    If .running is False, the bot is shutting down (or has otherwise lost the
+    connection to the master), and should not send any status messages. This
+    is taken care of in Command.sendStatus .
+
+    """
 
     # builder methods:
     #  sendStatus(dict) (zero or more)
     #  commandComplete() or commandInterrupted() (one, at end)
 
     debug = False
+    interrupted = False
+    running = False # set by Builder, cleared on shutdown or when the
+                    # Deferred fires
 
     def __init__(self, builder, stepId, args):
         self.builder = builder
         self.stepId = stepId # just for logging
         self.args = args
+        self.setup(args)
 
+    def setup(self, args):
+        """Override this in a subclass to extract items from the args dict."""
+        pass
+        
     def start(self):
         # should return a Deferred
         raise NotImplementedError, "You must implement this in a subclass"
 
     def sendStatus(self, status):
         """Send a status update to the master."""
-        if self.debug: log.msg("sendStatus", status)
+        if self.debug:
+            log.msg("sendStatus", status)
+        if not self.running:
+            log.msg("would sendStatus but not .running")
+            return
         self.builder.sendUpdate(status)
 
-    def NOTinterrupt(self):
-        """Stop the command. You must implement this in a subclass, then
-        call this parent method when it is done. After this is called, no
-        further status messages may be sent."""
-        self.builder = None # make sure we stop sending messages
-        self.interrupted = 1
-        self.deferred.errback(failure.Failure(CommandInterrupted()))
+    def interrupt(self):
+        """Override this in a subclass to allow commands to be interrupted.
+        May be called multiple times, use self.interrupted=True if this
+        matters."""
+        pass
 
     def _abandonOnFailure(self, rc):
         if type(rc) is not int:
@@ -375,6 +407,11 @@
         d = self.command.start()
         return d
 
+    def interrupt(self):
+        self.interrupted = True
+        self.command.kill("command interrupted")
+
+
 registerSlaveCommand("shell", SlaveShellCommand, cvs_ver)
 
 
@@ -382,18 +419,27 @@
     def start(self):
         self.d = defer.Deferred()
         log.msg("  starting dummy command [%s]" % self.stepId)
-        reactor.callLater(1, self.doStatus)
+        self.timer = reactor.callLater(1, self.doStatus)
         return self.d
 
+    def interrupt(self):
+        self.timer.cancel()
+        self.timer = None
+        self.interrupted = True
+        self.finished()
+
     def doStatus(self):
         log.msg("  sending intermediate status")
         self.sendStatus({'stdout': 'data'})
         timeout = self.args.get('timeout', 5) + 1
-        reactor.callLater(timeout - 1, self.finished)
+        self.timer = reactor.callLater(timeout - 1, self.finished)
 
     def finished(self):
         log.msg("  dummy command finished [%s]" % self.stepId)
-        self.sendStatus({'rc': 0})
+        if self.interrupted:
+            self.sendStatus({'rc': 1})
+        else:
+            self.sendStatus({'rc': 0})
         self.d.callback(0)
 
 registerSlaveCommand("dummy", DummyCommand, cvs_ver)
@@ -426,21 +472,19 @@
 
     """
 
-    def __init__(self, builder, stepId, args):
-        Command.__init__(self, builder, stepId, args)
+    def setup(self, args):
         self.workdir = args['workdir']
         self.mode = args.get('mode', "update")
         self.revision = args.get('revision')
         self.patch = args.get('patch')
         self.timeout = args.get('timeout', 120)
-        self.setup(args)
-
-    def setup(self, args):
-        """Override this in the VC-specific subclass to extract more args"""
-        pass
+        # VC-specific subclasses should override this to extract more args.
+        # Make sure to upcall!
 
     def start(self):
         self.sendStatus({'header': "starting " + self.header + "\n"})
+        self.command = None
+
         # self.srcdir is where the VC system should put the sources
         if self.mode == "copy":
             self.srcdir = "source" # hardwired directory name, sorry
@@ -466,6 +510,11 @@
         d.addCallbacks(self._sendRC, self._checkAbandoned)
         return d
 
+    def interrupt(self):
+        self.interrupted = True
+        if self.command:
+            self.command.kill("command interrupted")
+
     def doVC(self, res):
         if self.sourcedirIsUpdateable():
             d = self.doVCUpdate()
@@ -484,6 +533,8 @@
     def maybeDoVCFallback(self, rc):
         if type(rc) is int and rc == 0:
             return rc
+        if self.interrupted:
+            raise AbandonChain(1)
         msg = "update failed, clobbering and trying again"
         self.sendStatus({'header': msg + "\n"})
         log.msg(msg)
@@ -577,6 +628,7 @@
     header = "cvs operation"
 
     def setup(self, args):
+        SourceBase.setup(self, args)
         self.cvsroot = args['cvsroot']
         self.cvsmodule = args['cvsmodule']
         self.global_options = args.get('global_options', [])
@@ -651,6 +703,7 @@
     header = "svn operation"
 
     def setup(self, args):
+        SourceBase.setup(self, args)
         self.svnurl = args['svnurl']
 
     def sourcedirIsUpdateable(self):
@@ -695,6 +748,7 @@
     header = "darcs operation"
 
     def setup(self, args):
+        SourceBase.setup(self, args)
         self.repourl = args['repourl']
 
     def sourcedirIsUpdateable(self):
@@ -741,6 +795,7 @@
     buildconfig = None
 
     def setup(self, args):
+        SourceBase.setup(self, args)
         self.url = args['url']
         self.version = args['version']
 
@@ -825,6 +880,7 @@
     header = "p4 sync"
 
     def setup(self, args):
+        SourceBase.setup(self, args)
         self.p4port = args['p4port']
         
     def sourcedirIsUpdateable(self):

--- NEW FILE: interfaces.py ---
#! /usr/bin/python

from twisted.python.components import Interface

class ISlaveCommand(Interface):
    """This interface is implemented by all of the buildslave's Command
    subclasses. It specifies how the buildslave can start, interrupt, and
    query the various Commands running on behalf of the buildmaster."""

    def __init__(builder, stepId, args):
        """Create the Command. 'builder' is a reference to the parent
        buildbot.bot.SlaveBuilder instance, which will be used to send status
        updates (by calling builder.sendStatus). 'stepId' is a random string
        which helps correlate slave logs with the master. 'args' is a dict of
        arguments that comes from the master-side BuildStep, with contents
        that are specific to the individual Command subclass.

        This method is not intended to be subclassed."""

    def setup(args):
        """This method is provided for subclasses to override, to extract
        parameters from the 'args' dictionary. The default implemention does
        nothing. It will be called from __init__"""

    def start():
        """Begin the command, and return a Deferred.

        While the command runs, it should send status updates to the
        master-side BuildStep by calling self.sendStatus(status). The
        'status' argument is typically a dict with keys like 'stdout',
        'stderr', and 'rc'.

        When the step completes, it should fire the Deferred (the results are
        not used). If an exception occurs during execution, it may also
        errback the deferred, however any reasonable errors should be trapped
        and indicated with a non-zero 'rc' status rather than raising an
        exception. Exceptions should indicate problems within the buildbot
        itself, not problems in the project being tested.

        """

    def interrupt():
        """This is called to tell the Command that the build is being stopped
        and therefore the command should be terminated as quickly as
        possible. The command may continue to send status updates, up to and
        including an 'rc' end-of-command update (which should indicate an
        error condition). The Command's deferred should still be fired when
        the command has finally completed.

        If the build is being stopped because the slave it shutting down or
        because the connection to the buildmaster has been lost, the status
        updates will simply be discarded. The Command does not need to be
        aware of this.

        Child shell processes should be killed. Simple ShellCommand classes
        can just insert a header line indicating that the process will be
        killed, then os.kill() the child."""





More information about the Commits mailing list