[Buildbot-commits] buildbot/buildbot/process base.py,1.43,1.44 builder.py,1.18,1.19 step.py,1.56,1.57
Brian Warner
warner at users.sourceforge.net
Fri Dec 3 22:54:54 UTC 2004
- Previous message (by thread): [Buildbot-commits] buildbot/buildbot interfaces.py,1.20,1.21 master.py,1.54,1.55
- Next message (by thread): [Buildbot-commits] buildbot/buildbot/slave interfaces.py,NONE,1.1 bot.py,1.3,1.4 commands.py,1.16,1.17
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
Update of /cvsroot/buildbot/buildbot/buildbot/process
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/process
Modified Files:
base.py builder.py step.py
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.
Index: base.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/base.py,v
retrieving revision 1.43
retrieving revision 1.44
diff -u -d -r1.43 -r1.44
--- base.py 30 Sep 2004 07:15:35 -0000 1.43
+++ base.py 3 Dec 2004 22:54:51 -0000 1.44
@@ -5,7 +5,7 @@
from twisted.python import log, components
from twisted.python.failure import Failure
-from twisted.internet import reactor, defer
+from twisted.internet import reactor, defer, error
from twisted.spread import pb
import twisted.web.util
@@ -221,8 +221,10 @@
first Step. It returns a Deferred which will fire when the build
finishes."""
+ log.msg("%s.startBuild" % self)
self.build_status = build_status
self.remote = remote
+ self.remote.notifyOnDisconnect(self.lostRemote)
self.deferred = defer.Deferred()
try:
@@ -354,6 +356,8 @@
self.results.append(result)
if text:
self.text.extend(text)
+ if not self.remote:
+ terminate = True
if result == FAILURE:
if step.warnOnFailure:
if self.result != FAILURE:
@@ -371,22 +375,38 @@
self.result = FAILURE
return terminate
+ def lostRemote(self, remote):
+ # the slave went away. There are several possible reasons for this,
+ # and they aren't necessarily fatal. For now, kill the build, but
+ # TODO: see if we can resume the build when it reconnects.
+ log.msg("%s.lostRemote" % self)
+ self.remote = None
+ if self.currentStep:
+ # this should cause the step to finish.
+ log.msg(" stopping currentStep", self.currentStep)
+ self.currentStep.interrupt(Failure(error.ConnectionLost()))
+
def stopBuild(self, reason):
- # the idea here is to let the user cancel a build because, e.g., they
- # realized they committed a bug and they don't want to waste the time
- # building something that they know will fail. Another reason might
- # be to abandon a stuck build. We want to mark the build as failed
- # quickly rather than waiting for it to die on its own.
+ # the idea here is to let the user cancel a build because, e.g.,
+ # they realized they committed a bug and they don't want to waste
+ # the time building something that they know will fail. Another
+ # reason might be to abandon a stuck build. We want to mark the
+ # build as failed quickly rather than waiting for the slave's
+ # timeout to kill it on its own.
log.msg(" %s: stopping build: %s" % (self, reason))
- assert not self.finished
- #self.currentStep.stop(reason)
- # TODO: maybe let its deferred do buildFinished
- if self.currentStep and self.currentStep.progress:
- # XXX: really .fail or something
- self.currentStep.progress.finish()
- text = ["stopped", reason]
- self.buildFinished(text, "red", FAILURE)
+ if self.finished:
+ return
+ # TODO: include 'reason' in this point event
+ self.builder.builder_status.addPointEvent(['interrupt'])
+ self.currentStep.interrupt(reason)
+ if 0:
+ # TODO: maybe let its deferred do buildFinished
+ if self.currentStep and self.currentStep.progress:
+ # XXX: really .fail or something
+ self.currentStep.progress.finish()
+ text = ["stopped", reason]
+ self.buildFinished(text, "red", FAILURE)
def allStepsDone(self):
if self.result == FAILURE:
@@ -419,6 +439,8 @@
abandoned."""
self.finished = True
+ if self.remote:
+ self.remote.dontNotifyOnDisconnect(self.lostRemote)
self.results = results
log.msg(" %s: build finished" % self)
@@ -439,5 +461,9 @@
class BuildControl(components.Adapter):
__implements__ = interfaces.IBuildControl,
+
def getStatus(self):
return self.original.build_status
+
+ def stopBuild(self, reason="<no reason given>"):
+ self.original.stopBuild(reason)
Index: builder.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/builder.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -d -r1.18 -r1.19
--- builder.py 7 Nov 2004 20:32:47 -0000 1.18
+++ builder.py 3 Dec 2004 22:54:51 -0000 1.19
@@ -1,8 +1,8 @@
#! /usr/bin/python
-from twisted.python import log, components
+from twisted.python import log, components, failure
from twisted.spread import pb
-from twisted.internet import reactor
+from twisted.internet import reactor, defer
from buildbot import interfaces
from buildbot.status.progress import Expectations
@@ -172,6 +172,7 @@
def detached(self):
"""This is called when the connection to the bot is lost."""
+ log.msg("%s.detached" % self)
self.remote = None
reactor.callLater(0, self._detached)
# the current step will be stopped (via a notifyOnDisconnect
@@ -179,6 +180,7 @@
def _detached(self):
if self.currentBuild:
+ log.msg("%s._detached: killing build" % self)
# wasn't enough
self.currentBuild.stopBuild("slave lost")
self.currentBuild = None
@@ -298,7 +300,8 @@
def startBuild(self, build):
log.msg("starting build %s" % build)
- self.remote.callRemote("startBuild") # informational courtesy
+ d = self.remote.callRemote("startBuild") # informational courtesy
+ d.addErrback(self._startBuildFailed, build)
# create the BuildStatus object that goes with the Build
bs = self.builder_status.newBuild()
@@ -309,9 +312,14 @@
# Finally it will start the actual build process.
d = build.startBuild(bs, self.expectations, self.remote)
d.addCallback(self.buildFinished)
+ d.addErrback(log.err)
control = base.BuildControl(build)
return control
+ def _startBuildFailed(self, why, build):
+ log.msg("wanted to start build %s, but "
+ "remote_startBuild failed: %s" % (build, why))
+
def testsFinished(self, results):
# XXX: add build number, datestamp, Change information
#self.testTracker.testsFinished(results)
@@ -365,6 +373,50 @@
self.remote.callRemote("shutdown")
+class Ping:
+ def ping(self, status, remote, timeout):
+ if not remote:
+ status.addPointEvent(["ping", "no slave"], "red")
+ return defer.succeed(False) # interfaces.NoSlaveError
+ self.event = status.addEvent(["pinging"], "yellow")
+ self.active = True
+ self.d = defer.Deferred()
+ d = remote.callRemote("print", "ping")
+ d.addBoth(self._pong)
+
+ # We use either our own timeout or the (long) TCP timeout to detect
+ # silently-missing slaves. This might happen because of a NAT
+ # timeout or a routing loop. If the slave just shuts down (and we
+ # somehow missed the FIN), we should get a "connection refused"
+ # message.
+ self.timer = reactor.callLater(timeout, self.timeout)
+ return self.d
+
+ def timeout(self):
+ self.timer = None
+ self._pong(failure.Failure(interfaces.NoSlaveError("timeout")))
+
+ def _pong(self, res):
+ if not self.active:
+ return
+ self.active = False
+ if self.timer:
+ self.timer.cancel()
+ e = self.event
+ if isinstance(res, failure.Failure):
+ e.text = ["ping", "failed"]
+ e.color = "red"
+ ponged = False
+ # TODO: force the BotPerspective to disconnect, since this
+ # indicates that the bot is unreachable. That will also append a
+ # "disconnect" event to the builder_status, terminating this
+ # "ping failed" event.
+ else:
+ e.text = ["ping", "success"]
+ e.color = "green"
+ ponged = True
+ e.finish()
+ self.d.callback(ponged)
class BuilderControl(components.Adapter):
__implements__ = interfaces.IBuilderControl,
@@ -372,39 +424,15 @@
bc = self.original.forceBuild(who, reason)
return bc
- def ping(self, wait=False):
- status = self.original.builder_status
- if not self.original.remote:
- status.addPointEvent(["ping", "no slave"], "red")
- if wait:
- return defer.fail(interfaces.NoSlaveError())
- else:
- # we rely upon the TCP timeout to detect silently-missing
- # slaves. This might happen because of a NAT timeout or a
- # routing loop. If the slave just shuts down, we should get a
- # "connection refused" message. Of course, in that case we
- # should have gotten one for the connection anyway, but
- # sometimes things get lost.
- e = status.addEvent(["pinging"], "yellow")
- d = self.original.remote.callRemote("print", "ping")
- d.addCallback(self._pong, e)
- d.addErrback(self._pong_failed, e, wait)
- if wait:
- return d
-
- def _pong(self, res, e):
- e.text = ["ping", "success"]
- e.color = "green"
- e.finish()
+ def getBuild(self, number):
+ b = self.original.currentBuild
+ if b and b.build_status.number == number:
+ return base.BuildControl(b)
+ return None
- def _pong_failed(self, why, e, wait):
- e.text = ["ping", "failed"]
- e.color = "red"
- e.finish()
- # TODO: force the BotPerspective to disconnect, since this indicates
- # that the bot is unreachable. That will also append a "disconnect"
- # event to the builder_status, terminating this "ping failed" event.
- if wait:
- raise interfaces.NoSlaveError()
+ def ping(self, timeout=30):
+ d = Ping().ping(self.original.builder_status,
+ self.original.remote, timeout)
+ return d
components.registerAdapter(BuilderControl, Builder, interfaces.IBuilderControl)
Index: step.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/step.py,v
retrieving revision 1.56
retrieving revision 1.57
diff -u -d -r1.56 -r1.57
--- step.py 28 Oct 2004 07:25:30 -0000 1.56
+++ step.py 3 Dec 2004 22:54:51 -0000 1.57
@@ -27,8 +27,7 @@
SlaveCommands registered in the buildslave, and self.args to a dictionary
of arguments that will be passed to the SlaveCommand instance.
- start, remoteUpdate, remoteComplete, and remoteFailed are available to be
- overridden
+ start, remoteUpdate, and remoteComplete are available to be overridden
"""
@@ -41,6 +40,14 @@
self.remote_command = remote_command
self.args = args
+ def __getstate__(self):
+ dict = self.__dict__.copy()
+ # Remove the remote ref: if necessary (only for resumed builds), it
+ # will be reattached at resume time
+ if dict.has_key("remote"):
+ del dict["remote"]
+ return dict
+
def run(self, step, remote):
self.active = True
self.step = step
@@ -51,40 +58,40 @@
self.commandID = "%d" % c
log.msg("%s: RemoteCommand.run [%s]" % (self, self.commandID))
self.deferred = defer.Deferred()
+
d = defer.maybeDeferred(self.start)
- d.addErrback(self._remoteFailed) # will catch unknown commands
- return self.deferred
- def __getstate__(self):
- dict = self.__dict__.copy()
- # Remove the remote ref: if necessary (only for resumed builds), it
- # will be reattached at resume time
- if dict.has_key("remote"):
- del dict["remote"]
- return dict
+ # _finished is called with an error for unknown commands, errors
+ # that occur while the command is starting (including OSErrors in
+ # exec()), StaleBroker (when the connection was lost before we
+ # started), and pb.PBConnectionLost (when the slave isn't responding
+ # over this connection, perhaps it had a power failure, or NAT
+ # weirdness). If this happens, self.deferred is fired right away.
+ d.addErrback(self._finished)
+
+ # Connections which are lost while the command is running are caught
+ # when our parent Step calls our .lostRemote() method.
+ return self.deferred
def start(self):
# We will receive remote_update messages as the command runs.
# We will get a single remote_complete when it finishes.
# We should fire self.deferred when the command is done.
- self.remote.notifyOnDisconnect(self.disconnect)
d = self.remote.callRemote("startCommand", self, self.commandID,
self.remote_command, self.args)
return d
- def disconnect(self, broker):
- # lost the slave: fail the command
- log.msg("RemoteCommand.disconnect: lost slave", self)
- self.active = False
- self._remoteFailed(Failure(error.ConnectionLost()))
+ def interrupt(self, why):
+ if isinstance(why, Failure) and why.check(error.ConnectionLost):
+ log.msg("RemoteCommand.disconnect: lost slave", self)
+ self.remote = None
+ self._finished(Failure(error.ConnectionLost()))
+ return
- def todo_stop(self):
# tell the remote command to halt. Returns a Deferred that will fire
- # when the command has been stopped, or will errback if the slave is
- # unreachable.
+ # when the interrupt command has been delivered.
d = defer.maybeDeferred(self.remote.callRemote,
- "stopCommand", self, self.commandID,
- reason)
+ "interruptCommand", self.commandID, why)
return d
def remote_update(self, updates):
@@ -96,7 +103,7 @@
self.remoteUpdate(update)
except:
# log failure, terminate build, let slave retire the update
- self.failed(Failure())
+ self._finished(Failure())
# TODO: what if multiple updates arrive? should
# skip the rest but ack them all
if num > max_updatenum:
@@ -109,51 +116,45 @@
def remote_complete(self, failure=None):
# call the real remoteComplete a moment later, but first return an
# acknowledgement so the slave can retire the completion message.
- self.remote.dontNotifyOnDisconnect(self.disconnect)
if self.active:
- reactor.callLater(0, self._remoteComplete, failure)
+ reactor.callLater(0, self._finished, failure)
return None
- def _remoteComplete(self, failure):
- if failure:
- return self._remoteFailed(failure)
- try:
- self.remoteComplete()
- except:
- # log the failure and terminate the step
- log.msg("remoteComplete had exception")
- return self.failed(Failure())
- self.finished()
-
- def _remoteFailed(self, failure):
- log.msg("RemoteCommand._remoteFailed")
- try:
- self.remote.dontNotifyOnDisconnect(self.disconnect)
- except ValueError:
- # TODO: make this cleaner but keep it safe
- pass # probably already removed it in remote_complete.
- try:
- self.remoteFailed(failure)
- except:
- log.msg("RemoteCommand.remoteFailed failed")
- log.err()
- return self.failed(failure)
+ def _finished(self, failure=None):
+ self.active = False
+ # call .remoteComplete. If it raises an exception, or returns the
+ # Failure that we gave it, our self.deferred will be errbacked. If
+ # it does not (either it ate the Failure or there the step finished
+ # normally and it didn't raise a new exception), self.deferred will
+ # be callbacked.
+ d = defer.maybeDeferred(self.remoteComplete, failure)
+ # arrange for the callback to get this RemoteCommand instance
+ # instead of just None
+ d.addCallback(lambda r: self)
+ d.addBoth(self.deferred.callback)
- def remoteComplete(self):
- # subclasses should interpret status as they like and do cleanup
- pass
+ def remoteComplete(self, maybeFailure):
+ """Subclasses can override this.
- def remoteFailed(self, why):
- # subclasses should do any cleanup (like closing log files) here
- pass
+ This is called when the RemoteCommand has finished. 'maybeFailure'
+ will be None if the command completed normally, or a Failure
+ instance in one of the following situations:
- def finished(self):
- self.active = False
- self.deferred.callback(self)
+ # the slave was lost before the command was started
+ # the slave didn't respond to the startCommand message
+ # the slave raised an exception while starting the command
+ # (bad command name, bad args, OSError from missing executable)
+ # the slave raised an exception while finishing the command
+ # (they send back a remote_complete message with a Failure payload)
+ # and also (for now):
+ # slave disconnected while the command was running
+
+ This method should do cleanup, like closing log files. It should
+ normally return the 'failure' argument, so that any exceptions will
+ be propagated to the Step. If it wants to consume them, return None
+ instead."""
- def failed(self, why):
- self.active = False
- self.deferred.errback(why)
+ return failure
class LoggedRemoteCommand(RemoteCommand):
"""This is a RemoteCommand which expects the slave to send back
@@ -203,15 +204,14 @@
log.msg("%s rc=%s" % (self, rc))
self.addHeader("program finished with exit code %d\n" % rc)
- def remoteComplete(self):
- if self.closeWhenFinished:
- log.msg("closing log")
- self.log.finish()
-
- def remoteFailed(self, why):
+ def remoteComplete(self, maybeFailure):
if self.closeWhenFinished:
- self.addHeader("\nremoteFailed: %s" % why)
+ if maybeFailure:
+ self.addHeader("\nremoteFailed: %s" % maybeFailure)
+ else:
+ log.msg("closing log")
self.log.finish()
+ return maybeFailure
class RemoteShellCommand(LoggedRemoteCommand):
"""This class helps you run a shell command on the build slave. It will
@@ -426,6 +426,15 @@
raise NotImplementedError("your subclass must implement this method")
+ def interrupt(self, reason):
+ """Halt the command, either because the user has decided to cancel
+ the build ('reason' is a string), or because the slave has
+ disconnected ('reason' is a ConnectionLost Failure). Any further
+ local processing should be skipped, and the Step completed with an
+ error status. The results text should say something useful like
+ ['step', 'interrupted'] or ['remote', 'lost']"""
+ pass
+
def finished(self, results):
if self.progress:
self.progress.finish()
@@ -563,9 +572,24 @@
self.cmd.useLog(loog, True)
loog.logProgressTo(self.progress, "output")
d = self.runCommand(self.cmd)
- d.addCallback(self._commandComplete)
+ d.addCallbacks(self._commandComplete, self.checkDisconnect)
d.addErrback(self.failed)
+ def interrupt(self, reason):
+ # TODO: consider adding an INTERRUPTED or STOPPED status to use
+ # instead of FAILURE, might make the text a bit more clear
+ self.addCompleteLog('interrupt', reason)
+ d = self.cmd.interrupt(reason)
+ return d
+
+ def checkDisconnect(self, f):
+ f.trap(error.ConnectionLost)
+ self.step_status.setColor("red")
+ self.step_status.setText(self.describe(True) +
+ ["failed", "slave", "lost"])
+ self.step_status.setText2(["failed", "slave", "lost"])
+ return self.finished(FAILURE)
+
def _commandComplete(self, cmd):
self.commandComplete(cmd)
self.createSummary(cmd.log)
@@ -1106,33 +1130,44 @@
@param timeout: the number of seconds to delay
"""
+ haltOnFailure = True
name = "dummy"
def __init__(self, timeout=5, **kwargs):
BuildStep.__init__(self, **kwargs)
self.timeout = timeout
+ self.timer = None
+
def start(self):
self.step_status.setColor("yellow")
self.step_status.setText(["delay", "%s secs" % self.timeout])
- reactor.callLater(self.timeout, self._done)
- def _done(self):
+ self.timer = reactor.callLater(self.timeout, self.done)
+
+ def interrupt(self, reason):
+ if self.timer:
+ self.timer.cancel()
+ self.timer = None
+ self.step_status.setColor("red")
+ self.step_status.setText(["delay", "interrupted"])
+ self.finished(FAILURE)
+
+ def done(self):
self.step_status.setColor("green")
self.finished(SUCCESS)
-class FailingDummy(BuildStep):
+class FailingDummy(Dummy):
"""I am a dummy step that raises an Exception after 5 seconds
@param timeout: the number of seconds to delay
"""
name = "failing dummy"
- def __init__(self, timeout=5, **kwargs):
- BuildStep.__init__(self, **kwargs)
- self.timeout = timeout
+
def start(self):
self.step_status.setColor("yellow")
self.step_status.setText(["boom", "%s secs" % self.timeout])
- reactor.callLater(self.timeout, self.boom)
- def boom(self):
+ self.timer = reactor.callLater(self.timeout, self.done)
+
+ def done(self):
class Boom(Exception):
pass
try:
@@ -1150,6 +1185,7 @@
"""
name = "remote dummy"
+
def __init__(self, timeout=5, **kwargs):
BuildStep.__init__(self, **kwargs)
args = {'timeout': timeout}
- Previous message (by thread): [Buildbot-commits] buildbot/buildbot interfaces.py,1.20,1.21 master.py,1.54,1.55
- Next message (by thread): [Buildbot-commits] buildbot/buildbot/slave interfaces.py,NONE,1.1 bot.py,1.3,1.4 commands.py,1.16,1.17
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the Commits
mailing list