From warner at users.sourceforge.net Fri Dec 3 22:54:54 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Fri, 03 Dec 2004 22:54:54 +0000 Subject: [Buildbot-commits] buildbot/buildbot/process base.py,1.43,1.44 builder.py,1.18,1.19 step.py,1.56,1.57 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/process In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/process Modified Files: base.py builder.py step.py Log Message: Make commands (and builds) interruptible. Improve lost-slave behavior. Merging in several days of changes from local Arch branch, see ChangeLog for details about individual files. Index: base.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/process/base.py,v retrieving revision 1.43 retrieving revision 1.44 diff -u -d -r1.43 -r1.44 --- base.py 30 Sep 2004 07:15:35 -0000 1.43 +++ base.py 3 Dec 2004 22:54:51 -0000 1.44 @@ -5,7 +5,7 @@ from twisted.python import log, components from twisted.python.failure import Failure -from twisted.internet import reactor, defer +from twisted.internet import reactor, defer, error from twisted.spread import pb import twisted.web.util @@ -221,8 +221,10 @@ first Step. It returns a Deferred which will fire when the build finishes.""" + log.msg("%s.startBuild" % self) self.build_status = build_status self.remote = remote + self.remote.notifyOnDisconnect(self.lostRemote) self.deferred = defer.Deferred() try: @@ -354,6 +356,8 @@ self.results.append(result) if text: self.text.extend(text) + if not self.remote: + terminate = True if result == FAILURE: if step.warnOnFailure: if self.result != FAILURE: @@ -371,22 +375,38 @@ self.result = FAILURE return terminate + def lostRemote(self, remote): + # the slave went away. There are several possible reasons for this, + # and they aren't necessarily fatal. For now, kill the build, but + # TODO: see if we can resume the build when it reconnects. 
+ log.msg("%s.lostRemote" % self) + self.remote = None + if self.currentStep: + # this should cause the step to finish. + log.msg(" stopping currentStep", self.currentStep) + self.currentStep.interrupt(Failure(error.ConnectionLost())) + def stopBuild(self, reason): - # the idea here is to let the user cancel a build because, e.g., they - # realized they committed a bug and they don't want to waste the time - # building something that they know will fail. Another reason might - # be to abandon a stuck build. We want to mark the build as failed - # quickly rather than waiting for it to die on its own. + # the idea here is to let the user cancel a build because, e.g., + # they realized they committed a bug and they don't want to waste + # the time building something that they know will fail. Another + # reason might be to abandon a stuck build. We want to mark the + # build as failed quickly rather than waiting for the slave's + # timeout to kill it on its own. log.msg(" %s: stopping build: %s" % (self, reason)) - assert not self.finished - #self.currentStep.stop(reason) - # TODO: maybe let its deferred do buildFinished - if self.currentStep and self.currentStep.progress: - # XXX: really .fail or something - self.currentStep.progress.finish() - text = ["stopped", reason] - self.buildFinished(text, "red", FAILURE) + if self.finished: + return + # TODO: include 'reason' in this point event + self.builder.builder_status.addPointEvent(['interrupt']) + self.currentStep.interrupt(reason) + if 0: + # TODO: maybe let its deferred do buildFinished + if self.currentStep and self.currentStep.progress: + # XXX: really .fail or something + self.currentStep.progress.finish() + text = ["stopped", reason] + self.buildFinished(text, "red", FAILURE) def allStepsDone(self): if self.result == FAILURE: @@ -419,6 +439,8 @@ abandoned.""" self.finished = True + if self.remote: + self.remote.dontNotifyOnDisconnect(self.lostRemote) self.results = results log.msg(" %s: build finished" % self) 
@@ -439,5 +461,9 @@ class BuildControl(components.Adapter): __implements__ = interfaces.IBuildControl, + def getStatus(self): return self.original.build_status + + def stopBuild(self, reason=""): + self.original.stopBuild(reason) Index: builder.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/process/builder.py,v retrieving revision 1.18 retrieving revision 1.19 diff -u -d -r1.18 -r1.19 --- builder.py 7 Nov 2004 20:32:47 -0000 1.18 +++ builder.py 3 Dec 2004 22:54:51 -0000 1.19 @@ -1,8 +1,8 @@ #! /usr/bin/python -from twisted.python import log, components +from twisted.python import log, components, failure from twisted.spread import pb -from twisted.internet import reactor +from twisted.internet import reactor, defer from buildbot import interfaces from buildbot.status.progress import Expectations @@ -172,6 +172,7 @@ def detached(self): """This is called when the connection to the bot is lost.""" + log.msg("%s.detached" % self) self.remote = None reactor.callLater(0, self._detached) # the current step will be stopped (via a notifyOnDisconnect @@ -179,6 +180,7 @@ def _detached(self): if self.currentBuild: + log.msg("%s._detached: killing build" % self) # wasn't enough self.currentBuild.stopBuild("slave lost") self.currentBuild = None @@ -298,7 +300,8 @@ def startBuild(self, build): log.msg("starting build %s" % build) - self.remote.callRemote("startBuild") # informational courtesy + d = self.remote.callRemote("startBuild") # informational courtesy + d.addErrback(self._startBuildFailed, build) # create the BuildStatus object that goes with the Build bs = self.builder_status.newBuild() @@ -309,9 +312,14 @@ # Finally it will start the actual build process. 
d = build.startBuild(bs, self.expectations, self.remote) d.addCallback(self.buildFinished) + d.addErrback(log.err) control = base.BuildControl(build) return control + def _startBuildFailed(self, why, build): + log.msg("wanted to start build %s, but " + "remote_startBuild failed: %s" % (build, why)) + def testsFinished(self, results): # XXX: add build number, datestamp, Change information #self.testTracker.testsFinished(results) @@ -365,6 +373,50 @@ self.remote.callRemote("shutdown") +class Ping: + def ping(self, status, remote, timeout): + if not remote: + status.addPointEvent(["ping", "no slave"], "red") + return defer.succeed(False) # interfaces.NoSlaveError + self.event = status.addEvent(["pinging"], "yellow") + self.active = True + self.d = defer.Deferred() + d = remote.callRemote("print", "ping") + d.addBoth(self._pong) + + # We use either our own timeout or the (long) TCP timeout to detect + # silently-missing slaves. This might happen because of a NAT + # timeout or a routing loop. If the slave just shuts down (and we + # somehow missed the FIN), we should get a "connection refused" + # message. + self.timer = reactor.callLater(timeout, self.timeout) + return self.d + + def timeout(self): + self.timer = None + self._pong(failure.Failure(interfaces.NoSlaveError("timeout"))) + + def _pong(self, res): + if not self.active: + return + self.active = False + if self.timer: + self.timer.cancel() + e = self.event + if isinstance(res, failure.Failure): + e.text = ["ping", "failed"] + e.color = "red" + ponged = False + # TODO: force the BotPerspective to disconnect, since this + # indicates that the bot is unreachable. That will also append a + # "disconnect" event to the builder_status, terminating this + # "ping failed" event. 
+ else: + e.text = ["ping", "success"] + e.color = "green" + ponged = True + e.finish() + self.d.callback(ponged) class BuilderControl(components.Adapter): __implements__ = interfaces.IBuilderControl, @@ -372,39 +424,15 @@ bc = self.original.forceBuild(who, reason) return bc - def ping(self, wait=False): - status = self.original.builder_status - if not self.original.remote: - status.addPointEvent(["ping", "no slave"], "red") - if wait: - return defer.fail(interfaces.NoSlaveError()) - else: - # we rely upon the TCP timeout to detect silently-missing - # slaves. This might happen because of a NAT timeout or a - # routing loop. If the slave just shuts down, we should get a - # "connection refused" message. Of course, in that case we - # should have gotten one for the connection anyway, but - # sometimes things get lost. - e = status.addEvent(["pinging"], "yellow") - d = self.original.remote.callRemote("print", "ping") - d.addCallback(self._pong, e) - d.addErrback(self._pong_failed, e, wait) - if wait: - return d - - def _pong(self, res, e): - e.text = ["ping", "success"] - e.color = "green" - e.finish() + def getBuild(self, number): + b = self.original.currentBuild + if b and b.build_status.number == number: + return base.BuildControl(b) + return None - def _pong_failed(self, why, e, wait): - e.text = ["ping", "failed"] - e.color = "red" - e.finish() - # TODO: force the BotPerspective to disconnect, since this indicates - # that the bot is unreachable. That will also append a "disconnect" - # event to the builder_status, terminating this "ping failed" event. 
- if wait: - raise interfaces.NoSlaveError() + def ping(self, timeout=30): + d = Ping().ping(self.original.builder_status, + self.original.remote, timeout) + return d components.registerAdapter(BuilderControl, Builder, interfaces.IBuilderControl) Index: step.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/process/step.py,v retrieving revision 1.56 retrieving revision 1.57 diff -u -d -r1.56 -r1.57 --- step.py 28 Oct 2004 07:25:30 -0000 1.56 +++ step.py 3 Dec 2004 22:54:51 -0000 1.57 @@ -27,8 +27,7 @@ SlaveCommands registered in the buildslave, and self.args to a dictionary of arguments that will be passed to the SlaveCommand instance. - start, remoteUpdate, remoteComplete, and remoteFailed are available to be - overridden + start, remoteUpdate, and remoteComplete are available to be overridden """ @@ -41,6 +40,14 @@ self.remote_command = remote_command self.args = args + def __getstate__(self): + dict = self.__dict__.copy() + # Remove the remote ref: if necessary (only for resumed builds), it + # will be reattached at resume time + if dict.has_key("remote"): + del dict["remote"] + return dict + def run(self, step, remote): self.active = True self.step = step @@ -51,40 +58,40 @@ self.commandID = "%d" % c log.msg("%s: RemoteCommand.run [%s]" % (self, self.commandID)) self.deferred = defer.Deferred() + d = defer.maybeDeferred(self.start) - d.addErrback(self._remoteFailed) # will catch unknown commands - return self.deferred - def __getstate__(self): - dict = self.__dict__.copy() - # Remove the remote ref: if necessary (only for resumed builds), it - # will be reattached at resume time - if dict.has_key("remote"): - del dict["remote"] - return dict + # _finished is called with an error for unknown commands, errors + # that occur while the command is starting (including OSErrors in + # exec()), StaleBroker (when the connection was lost before we + # started), and pb.PBConnectionLost (when the slave 
isn't responding + # over this connection, perhaps it had a power failure, or NAT + # weirdness). If this happens, self.deferred is fired right away. + d.addErrback(self._finished) + + # Connections which are lost while the command is running are caught + # when our parent Step calls our .lostRemote() method. + return self.deferred def start(self): # We will receive remote_update messages as the command runs. # We will get a single remote_complete when it finishes. # We should fire self.deferred when the command is done. - self.remote.notifyOnDisconnect(self.disconnect) d = self.remote.callRemote("startCommand", self, self.commandID, self.remote_command, self.args) return d - def disconnect(self, broker): - # lost the slave: fail the command - log.msg("RemoteCommand.disconnect: lost slave", self) - self.active = False - self._remoteFailed(Failure(error.ConnectionLost())) + def interrupt(self, why): + if isinstance(why, Failure) and why.check(error.ConnectionLost): + log.msg("RemoteCommand.disconnect: lost slave", self) + self.remote = None + self._finished(Failure(error.ConnectionLost())) + return - def todo_stop(self): # tell the remote command to halt. Returns a Deferred that will fire - # when the command has been stopped, or will errback if the slave is - # unreachable. + # when the interrupt command has been delivered. d = defer.maybeDeferred(self.remote.callRemote, - "stopCommand", self, self.commandID, - reason) + "interruptCommand", self.commandID, why) return d def remote_update(self, updates): @@ -96,7 +103,7 @@ self.remoteUpdate(update) except: # log failure, terminate build, let slave retire the update - self.failed(Failure()) + self._finished(Failure()) # TODO: what if multiple updates arrive? should # skip the rest but ack them all if num > max_updatenum: @@ -109,51 +116,45 @@ def remote_complete(self, failure=None): # call the real remoteComplete a moment later, but first return an # acknowledgement so the slave can retire the completion message. 
- self.remote.dontNotifyOnDisconnect(self.disconnect) if self.active: - reactor.callLater(0, self._remoteComplete, failure) + reactor.callLater(0, self._finished, failure) return None - def _remoteComplete(self, failure): - if failure: - return self._remoteFailed(failure) - try: - self.remoteComplete() - except: - # log the failure and terminate the step - log.msg("remoteComplete had exception") - return self.failed(Failure()) - self.finished() - - def _remoteFailed(self, failure): - log.msg("RemoteCommand._remoteFailed") - try: - self.remote.dontNotifyOnDisconnect(self.disconnect) - except ValueError: - # TODO: make this cleaner but keep it safe - pass # probably already removed it in remote_complete. - try: - self.remoteFailed(failure) - except: - log.msg("RemoteCommand.remoteFailed failed") - log.err() - return self.failed(failure) + def _finished(self, failure=None): + self.active = False + # call .remoteComplete. If it raises an exception, or returns the + # Failure that we gave it, our self.deferred will be errbacked. If + # it does not (either it ate the Failure or there the step finished + # normally and it didn't raise a new exception), self.deferred will + # be callbacked. + d = defer.maybeDeferred(self.remoteComplete, failure) + # arrange for the callback to get this RemoteCommand instance + # instead of just None + d.addCallback(lambda r: self) + d.addBoth(self.deferred.callback) - def remoteComplete(self): - # subclasses should interpret status as they like and do cleanup - pass + def remoteComplete(self, maybeFailure): + """Subclasses can override this. - def remoteFailed(self, why): - # subclasses should do any cleanup (like closing log files) here - pass + This is called when the RemoteCommand has finished. 
'maybeFailure' + will be None if the command completed normally, or a Failure + instance in one of the following situations: - def finished(self): - self.active = False - self.deferred.callback(self) + # the slave was lost before the command was started + # the slave didn't respond to the startCommand message + # the slave raised an exception while starting the command + # (bad command name, bad args, OSError from missing executable) + # the slave raised an exception while finishing the command + # (they send back a remote_complete message with a Failure payload) + # and also (for now): + # slave disconnected while the command was running + + This method should do cleanup, like closing log files. It should + normally return the 'failure' argument, so that any exceptions will + be propagated to the Step. If it wants to consume them, return None + instead.""" - def failed(self, why): - self.active = False - self.deferred.errback(why) + return failure class LoggedRemoteCommand(RemoteCommand): """This is a RemoteCommand which expects the slave to send back @@ -203,15 +204,14 @@ log.msg("%s rc=%s" % (self, rc)) self.addHeader("program finished with exit code %d\n" % rc) - def remoteComplete(self): - if self.closeWhenFinished: - log.msg("closing log") - self.log.finish() - - def remoteFailed(self, why): + def remoteComplete(self, maybeFailure): if self.closeWhenFinished: - self.addHeader("\nremoteFailed: %s" % why) + if maybeFailure: + self.addHeader("\nremoteFailed: %s" % maybeFailure) + else: + log.msg("closing log") self.log.finish() + return maybeFailure class RemoteShellCommand(LoggedRemoteCommand): """This class helps you run a shell command on the build slave. 
It will @@ -426,6 +426,15 @@ raise NotImplementedError("your subclass must implement this method") + def interrupt(self, reason): + """Halt the command, either because the user has decided to cancel + the build ('reason' is a string), or because the slave has + disconnected ('reason' is a ConnectionLost Failure). Any further + local processing should be skipped, and the Step completed with an + error status. The results text should say something useful like + ['step', 'interrupted'] or ['remote', 'lost']""" + pass + def finished(self, results): if self.progress: self.progress.finish() @@ -563,9 +572,24 @@ self.cmd.useLog(loog, True) loog.logProgressTo(self.progress, "output") d = self.runCommand(self.cmd) - d.addCallback(self._commandComplete) + d.addCallbacks(self._commandComplete, self.checkDisconnect) d.addErrback(self.failed) + def interrupt(self, reason): + # TODO: consider adding an INTERRUPTED or STOPPED status to use + # instead of FAILURE, might make the text a bit more clear + self.addCompleteLog('interrupt', reason) + d = self.cmd.interrupt(reason) + return d + + def checkDisconnect(self, f): + f.trap(error.ConnectionLost) + self.step_status.setColor("red") + self.step_status.setText(self.describe(True) + + ["failed", "slave", "lost"]) + self.step_status.setText2(["failed", "slave", "lost"]) + return self.finished(FAILURE) + def _commandComplete(self, cmd): self.commandComplete(cmd) self.createSummary(cmd.log) @@ -1106,33 +1130,44 @@ @param timeout: the number of seconds to delay """ + haltOnFailure = True name = "dummy" def __init__(self, timeout=5, **kwargs): BuildStep.__init__(self, **kwargs) self.timeout = timeout + self.timer = None + def start(self): self.step_status.setColor("yellow") self.step_status.setText(["delay", "%s secs" % self.timeout]) - reactor.callLater(self.timeout, self._done) - def _done(self): + self.timer = reactor.callLater(self.timeout, self.done) + + def interrupt(self, reason): + if self.timer: + self.timer.cancel() + 
self.timer = None + self.step_status.setColor("red") + self.step_status.setText(["delay", "interrupted"]) + self.finished(FAILURE) + + def done(self): self.step_status.setColor("green") self.finished(SUCCESS) -class FailingDummy(BuildStep): +class FailingDummy(Dummy): """I am a dummy step that raises an Exception after 5 seconds @param timeout: the number of seconds to delay """ name = "failing dummy" - def __init__(self, timeout=5, **kwargs): - BuildStep.__init__(self, **kwargs) - self.timeout = timeout + def start(self): self.step_status.setColor("yellow") self.step_status.setText(["boom", "%s secs" % self.timeout]) - reactor.callLater(self.timeout, self.boom) - def boom(self): + self.timer = reactor.callLater(self.timeout, self.done) + + def done(self): class Boom(Exception): pass try: @@ -1150,6 +1185,7 @@ """ name = "remote dummy" + def __init__(self, timeout=5, **kwargs): BuildStep.__init__(self, **kwargs) args = {'timeout': timeout} From warner at users.sourceforge.net Fri Dec 3 22:54:55 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Fri, 03 Dec 2004 22:54:55 +0000 Subject: [Buildbot-commits] buildbot/buildbot/status builder.py,1.45,1.46 html.py,1.46,1.47 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/status In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/status Modified Files: builder.py html.py Log Message: Make commands (and builds) interruptible. Improve lost-slave behavior. Merging in several days of changes from local Arch branch, see ChangeLog for details about individual files. 
Index: builder.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/status/builder.py,v retrieving revision 1.45 retrieving revision 1.46 diff -u -d -r1.45 -r1.46 --- builder.py 24 Nov 2004 02:41:22 -0000 1.45 +++ builder.py 3 Dec 2004 22:54:52 -0000 1.46 @@ -340,8 +340,11 @@ return (self.finished is not None) def waitUntilFinished(self): - d = defer.Deferred() - self.finishedWatchers.append(d) + if self.finished: + d = defer.succeed(self) + else: + d = defer.Deferred() + self.finishedWatchers.append(d) return d # while the step is running, the following methods make sense. @@ -580,8 +583,11 @@ return (self.finished is not None) def waitUntilFinished(self): - d = defer.Deferred() - self.finishedWatchers.append(d) + if self.finished: + d = defer.succeed(self) + else: + d = defer.Deferred() + self.finishedWatchers.append(d) return d # while the build is running, the following methods make sense. @@ -1365,7 +1371,7 @@ return self.botmaster.builders[name].builder_status def getSlave(self, slavename): - return self.botmaster.slaveStatus[slavename] + return self.botmaster.slaves[slavename].slave_status def subscribe(self, target): self.watchers.append(target) Index: html.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/status/html.py,v retrieving revision 1.46 retrieving revision 1.47 diff -u -d -r1.46 -r1.47 --- html.py 24 Nov 2004 03:35:05 -0000 1.46 +++ html.py 3 Dec 2004 22:54:52 -0000 1.47 @@ -5,10 +5,11 @@ from twisted.python import log, components import urllib +from twisted.internet import defer, reactor from twisted.web.resource import Resource from twisted.web import static, html, server, distrib from twisted.web.error import NoResource -from twisted.web.util import Redirect +from twisted.web.util import Redirect, DeferredResource from twisted.application import service, internet from twisted.spread import pb @@ -35,6 +36,23 @@ 
class IHTMLLog(components.Interface): pass +ROW_TEMPLATE = ''' +
+ %(label)s + %(field)s +
''' + +def make_row(label, field): + """Create a name/value row for the HTML. + + `label` is plain text; it will be HTML-encoded. + + `field` is a bit of HTML structure; it will not be encoded in + any way. + """ + label = html.escape(label) + return ROW_TEMPLATE % {"label": label, "field": field} + colormap = { 'green': '#72ff75', } @@ -227,9 +245,10 @@ class StatusResourceBuild(HtmlResource): title = "Build" - def __init__(self, build): + def __init__(self, build, control): HtmlResource.__init__(self) self.build = build + self.control = control def body(self, request): b = self.build @@ -246,6 +265,19 @@ data += "

test results

\n" % url else: data += "

Build In Progress

" + if self.control is not None: + stopURL = urllib.quote(request.childLink("stop")) + data += """ +
+

To stop this build, fill out the following fields and + push the 'Stop' button

\n""" % stopURL + data += make_row("Your name:", + "") + data += make_row("Reason for stopping build:", + "") + data += """ +
+ """ data += ("

Blamelist:

\n" "
    \n") @@ -262,10 +294,31 @@ #data += html.PRE(b.changesText()) # TODO return data + def stop(self, request): + log.msg("web stopBuild of build %s:%s" % \ + (self.build.getBuilder().getName(), + self.build.getNumber())) + name = request.args.get("username", [""])[0] + comments = request.args.get("comments", [""])[0] + reason = ("The web-page 'stop build' button was pressed by " + "'%s': %s\n" % (name, comments)) + self.control.stopBuild(reason) + # we're at http://localhost:8080/svn-hello/builds/5/stop?[args] and + # we want to go to: http://localhost:8080/svn-hello/builds/5 or + # http://localhost:8080/ + # + #return Redirect("../%d" % self.build.getNumber()) + r = Redirect("../../..") + d = defer.Deferred() + reactor.callLater(1, d.callback, r) + return DeferredResource(d) + def getChild(self, path, request): if path == "tests": # TODO: this will collide with a step named 'tests' return StatusResourceTestResults(self.build.getTestResults()) + if path == "stop": + return self.stop(request) stepname = path steps = self.build.getSteps() for s in steps: @@ -285,17 +338,14 @@ def body(self, request): b = self.builder slave = b.getSlave() - data = self.make_row("Builder:", - html.escape(b.getName())) + data = make_row("Builder:", html.escape(b.getName())) b1 = b.getBuild(-1) if b1 is not None: - data += self.make_row("Current/last build:", - str(b1.getNumber())) + data += make_row("Current/last build:", str(b1.getNumber())) if slave.isConnected(): data += "\nCONNECTED (slave '%s')
    \n" % slave.getName() if slave.getAdmin(): - data += self.make_row("Admin:", - html.escape(slave.getAdmin())) + data += make_row("Admin:", html.escape(slave.getAdmin())) if slave.getHost(): data += "Host info:\n" data += html.PRE(slave.getHost()) @@ -309,10 +359,10 @@

    To force a build, fill out the following fields and push the 'Force Build' button

    """ - + self.make_row("Your name:", - "") - + self.make_row("Reason for build:", - "") + + make_row("Your name:", + "") + + make_row("Reason for build:", + "") + """
    @@ -334,23 +384,6 @@ return data - def make_row(self, label, field): - """Create a name/value row for the HTML. - - `label` is plain text; it will be HTML-encoded. - - `field` is a bit of HTML structure; it will not be encoded in - any way. - """ - label = html.escape(label) - return self.ROW_TEMPLATE % {"label": label, "field": field} - - ROW_TEMPLATE = ''' -
    - %(label)s - %(field)s -
    ''' - def force(self, request): name = request.args.get("username", [""])[0] reason = request.args.get("comments", [""])[0] @@ -378,6 +411,8 @@ return Redirect("..") def getChild(self, path, request): + log.msg('path=%s, postpath=%s, prepath=%s' % (path, request.postpath, + request.prepath)) if path == "force": return self.force(request) if path == "ping": @@ -407,7 +442,10 @@ if path == "builds": build = self.builder.getBuild(num) if build: - return StatusResourceBuild(build) + control = None + if self.control: + control = self.control.getBuild(num) + return StatusResourceBuild(build, control) else: return NoResource("No such build '%d'" % num) return NoResource("really weird URL %s" % path) From warner at users.sourceforge.net Fri Dec 3 22:54:55 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Fri, 03 Dec 2004 22:54:55 +0000 Subject: [Buildbot-commits] buildbot/buildbot/test test_run.py,1.18,1.19 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/test In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/test Modified Files: test_run.py Log Message: Make commands (and builds) interruptible. Improve lost-slave behavior. Merging in several days of changes from local Arch branch, see ChangeLog for details about individual files. 
Index: test_run.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/test/test_run.py,v retrieving revision 1.18 retrieving revision 1.19 diff -u -d -r1.18 -r1.19 --- test_run.py 30 Sep 2004 07:13:32 -0000 1.18 +++ test_run.py 3 Dec 2004 22:54:53 -0000 1.19 @@ -4,7 +4,7 @@ dr = unittest.deferredResult from twisted.internet import reactor, defer from twisted.python import log -import sys, os, shutil +import sys, os, shutil, time #log.startLogging(sys.stderr) from buildbot import master, interfaces @@ -44,6 +44,12 @@ BuildmasterConfig = c """ +class MyBot(bot.Bot): + def remote_getSlaveInfo(self): + return self.parent.info +class MyBuildSlave(bot.BuildSlave): + botClass = MyBot + class STarget: __implements__ = interfaces.IStatusReceiver, debug = False @@ -119,33 +125,97 @@ # now kill the timer b1.waiting.stopTimer() -class Status(unittest.TestCase): +class RunMixin: master = None slave = None + slave2 = None def setUp(self): shutil.rmtree("basedir", ignore_errors=1) + shutil.rmtree("slavebase", ignore_errors=1) + shutil.rmtree("slavebase2", ignore_errors=1) os.mkdir("basedir") self.master = master.BuildMaster("basedir") def connectSlave(self): port = self.master.slavePort._port.getHost().port os.mkdir("slavebase") - slave = bot.BuildSlave("localhost", port, "bot1", "sekrit", - "slavebase", keepalive=0, usePTY=1) + slave = MyBuildSlave("localhost", port, "bot1", "sekrit", + "slavebase", keepalive=0, usePTY=1) + slave.info = {"admin": "one"} self.slave = slave slave.startService() d = self.master.botmaster.waitUntilBuilderAttached("dummy") dr(d) + def connectSlave2(self): + port = self.master.slavePort._port.getHost().port + os.mkdir("slavebase2") + slave = MyBuildSlave("localhost", port, "bot1", "sekrit", + "slavebase2", keepalive=0, usePTY=1) + slave.info = {"admin": "two"} + self.slave2 = slave + slave.startService() + def tearDown(self): + log.msg("doing tearDown") + self.shutdownSlave() + if 
self.master: + dr(defer.maybeDeferred(self.master.stopService)) + self.master = None + + # various forms of slave death + + def shutdownSlave(self, waitForMasterToo=True): + # the slave has disconnected normally: they SIGINT'ed it, or it shut + # down willingly. This will kill child processes and give them a + # chance to finish up. if self.slave: - d = self.master.botmaster.waitUntilBuilderDetached("dummy") + d = self.slave.waitUntilDisconnected() dr(defer.maybeDeferred(self.slave.stopService)) dr(d) - if self.master: - dr(defer.maybeDeferred(self.master.stopService)) - + self.slave = None + if self.slave2: + d = self.slave2.waitUntilDisconnected() + dr(defer.maybeDeferred(self.slave2.stopService)) + dr(d) + self.slave2 = None + if waitForMasterToo: + d = self.master.botmaster.waitUntilBuilderDetached("dummy") + dr(d) + + def killSlave(self): + # the slave has died, its host sent a FIN. The .notifyOnDisconnect + # callbacks will terminate the current step, so the build should be + # flunked (no further steps should be started). + self.slave.bf.continueTrying = 0 + bot = self.slave.getServiceNamed("bot") + broker = bot.builders["dummy"].remote.broker + broker.transport.loseConnection() + self.slave = None + + def disappearSlave(self): + # the slave's host has vanished off the net, leaving the connection + # dangling. This will be detected quickly by app-level keepalives or + # a ping, or slowly by TCP timeouts. + + # implement this by replacing the slave Broker's .dataReceived method + # with one that just throws away all data. + def discard(data): + pass + bot = self.slave.getServiceNamed("bot") + broker = bot.builders["dummy"].remote.broker + broker.dataReceived = discard # seal its ears + broker.transport.write = discard # and take away its voice + + def ghostSlave(self): + # the slave thinks it has lost the connection, and initiated a + # reconnect. The master doesn't yet realize it has lost the previous + # connection, and sees two connections at once. 
+ raise NotImplementedError + +class Status(RunMixin, unittest.TestCase): + def testSlave(self): m = self.master s = m.getStatus() @@ -269,3 +339,166 @@ res = dr(d) self.failUnless(3.0 < t4.eta_build < 5.0) # should be 4 seconds + +class Disconnect(RunMixin, unittest.TestCase): + + def disconnectSetup(self): + # verify that disconnecting the slave during a build properly + # terminates the build + m = self.master + s = m.getStatus() + c = interfaces.IControl(m) + + m.loadConfig(config_2) + m.readConfig = True + m.startService() + + self.failUnlessEqual(s.getBuilderNames(), ["dummy"]) + s1 = s.getBuilder("dummy") + self.failUnlessEqual(s1.getName(), "dummy") + self.failUnlessEqual(s1.getState(), ("offline", None, None)) + self.failUnlessEqual(s1.getCurrentBuild(), None) + self.failUnlessEqual(s1.getLastFinishedBuild(), None) + self.failUnlessEqual(s1.getBuild(-1), None) + + self.connectSlave() + self.failUnlessEqual(s1.getState(), ("idle", None, None)) + return m,s,c,s1 + + def verifyDisconnect(self, bs): + self.failUnless(bs.isFinished()) + + step1 = bs.getSteps()[0] + self.failUnlessEqual(step1.getText(), ["delay", "interrupted"]) + self.failUnlessEqual(step1.getResults()[0], builder.FAILURE) + + self.failUnlessEqual(bs.getResults(), builder.FAILURE) + + + def testIdle1(self): + m,s,c,s1 = self.disconnectSetup() + # disconnect the slave before the build starts + self.shutdownSlave() # dies before it gets started + + # trying to force a build now will cause an error. Regular builds + # just wait for the slave to re-appear, but forced builds that + # cannot be run right away trigger NoSlaveErrors + fb = c.getBuilder("dummy").forceBuild + self.failUnlessRaises(interfaces.NoSlaveError, + fb, None, "forced build") + + def testIdle2(self): + m,s,c,s1 = self.disconnectSetup() + # now suppose the slave goes missing + self.disappearSlave() + + # forcing a build will work: the build will begin, since we think we + # have a slave. 
The build will fail, however, because of a timeout + # error. + bc = c.getBuilder("dummy").forceBuild(None, "forced build") + bs = bc.getStatus() + print "build started" + d = bs.waitUntilFinished() + dr(d, 5) + print bs.getText() + testIdle2.skip = "short timeout not yet implemented" + + def testBuild1(self): + m,s,c,s1 = self.disconnectSetup() + # this next sequence is timing-dependent. The dummy build takes at + # least 3 seconds to complete, and this batch of commands must + # complete within that time. + # + bc = c.getBuilder("dummy").forceBuild(None, "forced build") + bs = bc.getStatus() + # kill the slave while it's running the first step + self.shutdownSlave() # dies before it gets started + + # now examine the just-stopped build and make sure it is really + # stopped. This is checking for bugs in which the slave-detach gets + # missed or causes an exception which prevents the build from being + # marked as "finished due to an error". + d = bs.waitUntilFinished() + dr(d, 5) + + self.failUnlessEqual(s1.getState()[0], "offline") + self.verifyDisconnect(bs) + + def testBuild2(self): + m,s,c,s1 = self.disconnectSetup() + # this next sequence is timing-dependent + bc = c.getBuilder("dummy").forceBuild(None, "forced build") + bs = bc.getStatus() + # shutdown the slave while it's running the first step + reactor.callLater(0.5, self.shutdownSlave) + + dr(bs.waitUntilFinished(), 5) + + self.failUnlessEqual(s1.getState()[0], "offline") + self.verifyDisconnect(bs) + + def testBuild3(self): + m,s,c,s1 = self.disconnectSetup() + # this next sequence is timing-dependent + bc = c.getBuilder("dummy").forceBuild(None, "forced build") + bs = bc.getStatus() + # kill the slave while it's running the first step + reactor.callLater(0.5, self.killSlave) + + dr(bs.waitUntilFinished(), 5) + + self.failUnlessEqual(s1.getState()[0], "offline") + self.verifyDisconnect(bs) + + def testInterrupt(self): + m,s,c,s1 = self.disconnectSetup() + # this next sequence is timing-dependent + bc = 
c.getBuilder("dummy").forceBuild(None, "forced build") + bs = bc.getStatus() + # halt the build while it's running the first step + reactor.callLater(0.5, bc.stopBuild, "bang go splat") + + dr(bs.waitUntilFinished(), 5) + + self.verifyDisconnect(bs) + + def testDisappear(self): + m,s,c,s1 = self.disconnectSetup() + bc = c.getBuilder("dummy") + + # ping should succeed + d = bc.ping(1) + res = dr(d) + self.failUnlessEqual(res, True) + + # now, before any build is run, make the slave disappear + self.slave.bf.continueTrying = 0 + self.disappearSlave() + + # at this point, a ping to the slave should timeout + d = bc.ping(1) + res = dr(d) + self.failUnlessEqual(res, False) + + def testDuplicate(self): + m,s,c,s1 = self.disconnectSetup() + bc = c.getBuilder("dummy") + bs = s.getBuilder("dummy") + ss = bs.getSlave() + + self.failUnless(ss.isConnected()) + self.failUnlessEqual(ss.getAdmin(), "one") + + # now, before any build is run, make the first slave disappear + self.slave.bf.continueTrying = 0 + self.disappearSlave() + + d = self.master.botmaster.waitUntilBuilderDetached("dummy") + # now let the new slave take over + self.connectSlave2() + dr(d, 2) + d = self.master.botmaster.waitUntilBuilderAttached("dummy") + dr(d, 2) + + self.failUnless(ss.isConnected()) + self.failUnlessEqual(ss.getAdmin(), "two") From warner at users.sourceforge.net Fri Dec 3 22:54:54 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Fri, 03 Dec 2004 22:54:54 +0000 Subject: [Buildbot-commits] buildbot/buildbot/slave interfaces.py,NONE,1.1 bot.py,1.3,1.4 commands.py,1.16,1.17 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/slave In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/slave Modified Files: bot.py commands.py Added Files: interfaces.py Log Message: Make commands (and builds) interruptible. Improve lost-slave behavior. Merging in several days of changes from local Arch branch, see ChangeLog for details about individual files. 
Index: bot.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/bot.py,v retrieving revision 1.3 retrieving revision 1.4 diff -u -d -r1.3 -r1.4 --- bot.py 30 Aug 2004 22:15:23 -0000 1.3 +++ bot.py 3 Dec 2004 22:54:52 -0000 1.4 @@ -35,21 +35,20 @@ def __init__(self, builder): self.builder = builder -class SlaveBuilder(pb.Referenceable): +class SlaveBuilder(pb.Referenceable, service.Service): """This is the local representation of a single Builder: it handles a single kind of build (like an all-warnings build). It has a name and a home directory. The rest of its behavior is determined by the master. """ - + + stopCommandOnShutdown = True + def __init__(self, parent, name, builddir, not_really): + #service.Service.__init__(self) self.name = name - self.bot = parent self.builddir = builddir self.not_really = not_really - self.basedir = os.path.join(self.bot.basedir, builddir) - if not os.path.isdir(self.basedir): - os.mkdir(self.basedir) self.remote = None # remote is a ref to the Builder object on the master side, and is # set when they attach. 
It really isn't used very much @@ -74,6 +73,20 @@ def __repr__(self): return "" % self.name + + def setServiceParent(self, parent): + service.Service.setServiceParent(self, parent) + self.bot = self.parent + self.basedir = os.path.join(self.bot.basedir, self.builddir) + if not os.path.isdir(self.basedir): + os.mkdir(self.basedir) + + def stopService(self): + service.Service.stopService(self) + if self.command and self.stopCommandOnShutdown: + self.stopCommand() + self.command = None + def remote_setMaster(self, remote): self.remote = remote self.remote.notifyOnDisconnect(self.lostRemote) @@ -82,14 +95,13 @@ def lostRemote(self, remote): log.msg("lost remote") - if self.remote != remote: - print "WEIRD: lost the wrong remote" self.remote = None + def lostRemoteStep(self, remotestep): log.msg("lost remote step") - if self.remoteStep != remotestep: - print "WEIRD: lost the wrong remote step" self.remoteStep = None + if self.command and self.stopCommandOnShutdown: + self.stopCommand() # the following are Commands that can be invoked by the master-side # Builder @@ -105,7 +117,7 @@ if self.command: log.msg("leftover command, dropping it") - #self.stopCommand() + self.stopCommand() self.command = None try: @@ -120,25 +132,49 @@ self.remoteStep.notifyOnDisconnect(self.lostRemoteStep) self.updateNum = 0 self.complete = None + self.command.running = True d = defer.maybeDeferred(self.command.start) d.addCallbacks(self.commandComplete, self.commandFailed) return None - # the following are invoked by the Commands we spawn + def remote_interruptCommand(self, stepId, why): + """Halt the current step.""" + log.msg("asked to interrupt current command: %s" % why) + if not self.command: + # TODO: just log it, a race could result in their interrupting a + # command that wasn't actually running + log.msg(" .. 
but none was running") + return + self.command.interrupt() + + def stopCommand(self): + if not self.command: + return + self.command.running = False + if not self.command.interrupted: + self.command.interrupt() + + + # these two are fired by the Deferred attached to each Command def commandComplete(self, dummy): + if not self.running: + return self.sendComplete() def commandFailed(self, why): + if not self.running: + return log.msg("commandFailed") log.err(why) self.sendComplete(why) - - # these are utility routines used by sendStatus and commandComplete + # sendUpdate is invoked by the Commands we spawn def sendUpdate(self, data=None): """This sends the status update to the master-side BuildStep object, giving it a sequence number in the process. It adds the update to a queue, and asks the master to acknowledge the update so it can be removed from that queue.""" + if not self.running: + return self.updateNum += 1 update = [data, self.updateNum] #log.msg("sendUpdate", update) @@ -149,6 +185,8 @@ d.addCallback(self.ackUpdate) d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate") + # these are utility routines used by sendStatus and commandComplete + def dummy(self, value): pass @@ -218,10 +256,6 @@ self.sendAllUpdates() self.sendAllCompletes() - def stopCommand(self): - if self.command: - self.command.interrupt() - self.command = None def finishCommand(self): log.msg("SlaveBuilder.finishCommand", self.command) self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep) @@ -233,10 +267,12 @@ reactor.stop() -class Bot(pb.Referenceable): +class Bot(pb.Referenceable, service.MultiService): usePTY = None + name = "bot" def __init__(self, basedir, usePTY, not_really=0): + service.MultiService.__init__(self) self.basedir = basedir self.usePTY = usePTY self.not_really = not_really @@ -263,11 +299,13 @@ else: b = SlaveBuilder(self, name, builddir, self.not_really) b.usePTY = self.usePTY + b.setServiceParent(self) self.builders[name] = b retval[name] = b for name in 
self.builders.keys(): if not name in map(lambda a: a[0], wanted): log.msg("removing old builder %s" % name) + self.builder[name].disownServiceParent() del(self.builders[name]) return retval @@ -370,18 +408,33 @@ self.keepaliveTimer = None -class BuildSlave(internet.TCPClient): +class BuildSlave(service.MultiService): + botClass = Bot + def __init__(self, host, port, name, passwd, basedir, keepalive, usePTY): - bot = Bot(basedir, usePTY) + service.MultiService.__init__(self) + bot = self.botClass(basedir, usePTY) + bot.setServiceParent(self) bf = self.bf = BotFactory(keepalive) bf.startLogin(credentials.UsernamePassword(name, passwd), client=bot) - internet.TCPClient.__init__(self, host, port, bf) + self.connection = c = internet.TCPClient(host, port, bf) + c.setServiceParent(self) + + def waitUntilDisconnected(self): + # utility method for testing. Returns a Deferred that will fire when + # we lose the connection to the master. + if not self.bf.perspective: + return defer.succeed(None) + d = defer.Deferred() + self.bf.perspective.notifyOnDisconnect(lambda res: d.callback(None)) + return d def stopService(self): self.bf.continueTrying = 0 - internet.TCPClient.stopService(self) - return self._connection.disconnect() + service.MultiService.stopService(self) + # now kill the TCP connection + self.connection._connection.disconnect() class Options(usage.Options): synopsis = "Usage: mktap buildbot slave --name --passwd [options]" Index: commands.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/commands.py,v retrieving revision 1.16 retrieving revision 1.17 diff -u -d -r1.16 -r1.17 --- commands.py 28 Oct 2004 07:27:08 -0000 1.16 +++ commands.py 3 Dec 2004 22:54:52 -0000 1.17 @@ -6,10 +6,14 @@ from twisted.internet import reactor, defer from twisted.python import log, failure, runtime +from buildbot.slave.interfaces import ISlaveCommand from buildbot.slave.registry import registerSlaveCommand cvs_ver 
= '$Revision$'[1+len("Revision: "):-2] +# version history: +# >=1.17: commands are interruptable + class CommandInterrupted(Exception): pass class TimeoutError(Exception): @@ -206,11 +210,13 @@ def doTimeout(self): msg = "command timed out: %d seconds without output" % self.timeout + self.kill(msg) + + def kill(self, msg): msg += ", killing pid %d" % self.process.pid log.msg(msg) self.sendStatus({'header': "\n" + msg + "\n"}) - # TODO: nicer way is: SIGTERM, wait, SIGKILL, wait, freak hit = 0 if runtime.platformType == "posix": try: @@ -226,14 +232,15 @@ # probably no-such-process, maybe because there is no process # group pass - try: - log.msg("trying process.signalProcess('KILL')") - self.process.signalProcess('KILL') - log.msg(" successful") - hit = 1 - except OSError: - # could be no-such-process, because they finished very recently - pass + if not hit: + try: + log.msg("trying process.signalProcess('KILL')") + self.process.signalProcess('KILL') + log.msg(" successful") + hit = 1 + except OSError: + # could be no-such-process, because they finished very recently + pass if not hit: log.msg("signalProcess/os.kill failed both times") # finished ought to be called momentarily @@ -248,21 +255,17 @@ # be raised as pp tries to send status through .command self.commandFailed(TimeoutError("SIGKILL failed to kill process")) - def interrupt(self): - log.msg("interrupting process", self.process) - self.process.signalProcess('KILL') - # some stdout/stderr may be lost, along with the exit code - class Command: + __implements__ = ISlaveCommand, - """This class defines one command that can be invoked by the build - master. The command is executed on the slave side, and always sends back - a completion message when it finishes. It may also send intermediate - status as it runs (by calling builder.sendStatus). 
Some commands can be - interrupted (either by the build master or a local SIGINT), in which - case the status message indicates the step failed to complete because of - an interruption. + """This class defines one command that can be invoked by the build master. + The command is executed on the slave side, and always sends back a + completion message when it finishes. It may also send intermediate status + as it runs (by calling builder.sendStatus). Some commands can be + interrupted (either by the build master or a local timeout), in which + case the step is expected to complete normally with a status message that + indicates an error occurred. These commands are used by BuildSteps on the master side. Each kind of BuildStep uses a single Command. The slave must implement all the @@ -274,46 +277,75 @@ where 'builder' is the parent SlaveBuilder object, and 'args' is a dict that is interpreted per-command. + The setup(args) method is available for setup, and is run from __init__. + The Command is started with start(). This method must be implemented in a subclass, and it should return a Deferred. When your step is done, you should fire the Deferred (the results are not used). If the command is - interrupted, it should errback with a CommandInterrupted failure. + interrupted, it should fire the Deferred anyway. - The status messages all carry a dict, which is interpreted by the - master-side BuildStep however it likes. The completion message only - specifies whether the command was interrupted or not. If the Command - needs to return an exit code of some sort, that should be sent as a - regular status message before the completion message is sent. Once - builder.commandComplete has been run, no more status messages may be - sent.""" + While the command runs, it may send status messages back to the + buildmaster by calling self.sendStatus(statusdict). The statusdict is + interpreted by the master-side BuildStep however it likes.
+ + A separate completion message is sent when the deferred fires, which + indicates that the Command has finished, but does not carry any status + data. If the Command needs to return an exit code of some sort, that + should be sent as a regular status message before the deferred is fired . + Once builder.commandComplete has been run, no more status messages may be + sent. + + If interrupt() is called, the Command should attempt to shut down as + quickly as possible. Child processes should be killed, new ones should + not be started. The Command should send some kind of error status update, + then complete as usual by firing the Deferred. + + .interrupted should be set by interrupt(), and can be tested to avoid + sending multiple error status messages. + + If .running is False, the bot is shutting down (or has otherwise lost the + connection to the master), and should not send any status messages. This + is taken care of in Command.sendStatus . + + """ # builder methods: # sendStatus(dict) (zero or more) # commandComplete() or commandInterrupted() (one, at end) debug = False + interrupted = False + running = False # set by Builder, cleared on shutdown or when the + # Deferred fires def __init__(self, builder, stepId, args): self.builder = builder self.stepId = stepId # just for logging self.args = args + self.setup(args) + def setup(self, args): + """Override this in a subclass to extract items from the args dict.""" + pass + def start(self): # should return a Deferred raise NotImplementedError, "You must implement this in a subclass" def sendStatus(self, status): """Send a status update to the master.""" - if self.debug: log.msg("sendStatus", status) + if self.debug: + log.msg("sendStatus", status) + if not self.running: + log.msg("would sendStatus but not .running") + return self.builder.sendUpdate(status) - def NOTinterrupt(self): - """Stop the command. You must implement this in a subclass, then - call this parent method when it is done. 
After this is called, no - further status messages may be sent.""" - self.builder = None # make sure we stop sending messages - self.interrupted = 1 - self.deferred.errback(failure.Failure(CommandInterrupted())) + def interrupt(self): + """Override this in a subclass to allow commands to be interrupted. + May be called multiple times, use self.interrupted=True if this + matters.""" + pass def _abandonOnFailure(self, rc): if type(rc) is not int: @@ -375,6 +407,11 @@ d = self.command.start() return d + def interrupt(self): + self.interrupted = True + self.command.kill("command interrupted") + + registerSlaveCommand("shell", SlaveShellCommand, cvs_ver) @@ -382,18 +419,27 @@ def start(self): self.d = defer.Deferred() log.msg(" starting dummy command [%s]" % self.stepId) - reactor.callLater(1, self.doStatus) + self.timer = reactor.callLater(1, self.doStatus) return self.d + def interrupt(self): + self.timer.cancel() + self.timer = None + self.interrupted = True + self.finished() + def doStatus(self): log.msg(" sending intermediate status") self.sendStatus({'stdout': 'data'}) timeout = self.args.get('timeout', 5) + 1 - reactor.callLater(timeout - 1, self.finished) + self.timer = reactor.callLater(timeout - 1, self.finished) def finished(self): log.msg(" dummy command finished [%s]" % self.stepId) - self.sendStatus({'rc': 0}) + if self.interrupted: + self.sendStatus({'rc': 1}) + else: + self.sendStatus({'rc': 0}) self.d.callback(0) registerSlaveCommand("dummy", DummyCommand, cvs_ver) @@ -426,21 +472,19 @@ """ - def __init__(self, builder, stepId, args): - Command.__init__(self, builder, stepId, args) + def setup(self, args): self.workdir = args['workdir'] self.mode = args.get('mode', "update") self.revision = args.get('revision') self.patch = args.get('patch') self.timeout = args.get('timeout', 120) - self.setup(args) - - def setup(self, args): - """Override this in the VC-specific subclass to extract more args""" - pass + # VC-specific subclasses should override this to 
extract more args. + # Make sure to upcall! def start(self): self.sendStatus({'header': "starting " + self.header + "\n"}) + self.command = None + # self.srcdir is where the VC system should put the sources if self.mode == "copy": self.srcdir = "source" # hardwired directory name, sorry @@ -466,6 +510,11 @@ d.addCallbacks(self._sendRC, self._checkAbandoned) return d + def interrupt(self): + self.interrupted = True + if self.command: + self.command.kill("command interrupted") + def doVC(self, res): if self.sourcedirIsUpdateable(): d = self.doVCUpdate() @@ -484,6 +533,8 @@ def maybeDoVCFallback(self, rc): if type(rc) is int and rc == 0: return rc + if self.interrupted: + raise AbandonChain(1) msg = "update failed, clobbering and trying again" self.sendStatus({'header': msg + "\n"}) log.msg(msg) @@ -577,6 +628,7 @@ header = "cvs operation" def setup(self, args): + SourceBase.setup(self, args) self.cvsroot = args['cvsroot'] self.cvsmodule = args['cvsmodule'] self.global_options = args.get('global_options', []) @@ -651,6 +703,7 @@ header = "svn operation" def setup(self, args): + SourceBase.setup(self, args) self.svnurl = args['svnurl'] def sourcedirIsUpdateable(self): @@ -695,6 +748,7 @@ header = "darcs operation" def setup(self, args): + SourceBase.setup(self, args) self.repourl = args['repourl'] def sourcedirIsUpdateable(self): @@ -741,6 +795,7 @@ buildconfig = None def setup(self, args): + SourceBase.setup(self, args) self.url = args['url'] self.version = args['version'] @@ -825,6 +880,7 @@ header = "p4 sync" def setup(self, args): + SourceBase.setup(self, args) self.p4port = args['p4port'] def sourcedirIsUpdateable(self): --- NEW FILE: interfaces.py --- #! /usr/bin/python from twisted.python.components import Interface class ISlaveCommand(Interface): """This interface is implemented by all of the buildslave's Command subclasses. 
It specifies how the buildslave can start, interrupt, and query the various Commands running on behalf of the buildmaster.""" def __init__(builder, stepId, args): """Create the Command. 'builder' is a reference to the parent buildbot.bot.SlaveBuilder instance, which will be used to send status updates (by calling builder.sendStatus). 'stepId' is a random string which helps correlate slave logs with the master. 'args' is a dict of arguments that comes from the master-side BuildStep, with contents that are specific to the individual Command subclass. This method is not intended to be subclassed.""" def setup(args): """This method is provided for subclasses to override, to extract parameters from the 'args' dictionary. The default implementation does nothing. It will be called from __init__""" def start(): """Begin the command, and return a Deferred. While the command runs, it should send status updates to the master-side BuildStep by calling self.sendStatus(status). The 'status' argument is typically a dict with keys like 'stdout', 'stderr', and 'rc'. When the step completes, it should fire the Deferred (the results are not used). If an exception occurs during execution, it may also errback the deferred, however any reasonable errors should be trapped and indicated with a non-zero 'rc' status rather than raising an exception. Exceptions should indicate problems within the buildbot itself, not problems in the project being tested. """ def interrupt(): """This is called to tell the Command that the build is being stopped and therefore the command should be terminated as quickly as possible. The command may continue to send status updates, up to and including an 'rc' end-of-command update (which should indicate an error condition). The Command's deferred should still be fired when the command has finally completed.
If the build is being stopped because the slave is shutting down or because the connection to the buildmaster has been lost, the status updates will simply be discarded. The Command does not need to be aware of this. Child shell processes should be killed. Simple ShellCommand classes can just insert a header line indicating that the process will be killed, then os.kill() the child.""" From warner at users.sourceforge.net Fri Dec 3 22:54:53 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Fri, 03 Dec 2004 22:54:53 +0000 Subject: [Buildbot-commits] buildbot/buildbot interfaces.py,1.20,1.21 master.py,1.54,1.55 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot Modified Files: interfaces.py master.py Log Message: Make commands (and builds) interruptible. Improve lost-slave behavior. Merging in several days of changes from local Arch branch, see ChangeLog for details about individual files. Index: interfaces.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/interfaces.py,v retrieving revision 1.20 retrieving revision 1.21 diff -u -d -r1.20 -r1.21 --- interfaces.py 15 Oct 2004 16:59:44 -0000 1.20 +++ interfaces.py 3 Dec 2004 22:54:50 -0000 1.21 @@ -603,8 +603,18 @@ further control the new build, or from which an IBuildStatus object can be obtained.""" - def ping(): - """Attempt to contact the slave and see if it is still alive.""" + def getBuild(number): + """Attempt to return an IBuildControl object for the given build. + Returns None if no such object is available. This will only work for + the build that is currently in progress: once the build finishes, + there is nothing to control anymore.""" + + def ping(timeout=30): + """Attempt to contact the slave and see if it is still alive. This + returns a Deferred which fires with either True (the slave is still + alive) or False (the slave did not respond).
As a side effect, adds + an event to this builder's column in the waterfall display + containing the results of the ping.""" # TODO: this ought to live in ISlaveControl, maybe with disconnect() # or something. However the event that is emitted is most useful in # the Builder column, so it kinda fits here too. @@ -612,3 +622,7 @@ class IBuildControl(Interface): def getStatus(): """Return an IBuildStatus object for the Build that I control.""" + def stopBuild(reason=""): + """Halt the build. This has no effect if the build has already + finished.""" + Index: master.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/master.py,v retrieving revision 1.54 retrieving revision 1.55 diff -u -d -r1.54 -r1.55 --- master.py 14 Oct 2004 16:47:33 -0000 1.54 +++ master.py 3 Dec 2004 22:54:50 -0000 1.55 @@ -38,28 +38,30 @@ class BotPerspective(NewCredPerspective): """This is the master-side representative for a remote buildbot slave. - When buildbots connect in, they get a reference to a new instance of - this class. The BotMaster object is stashed as the .service - attribute.""" + There is exactly one for each slave described in the config file (the + c['bots'] list). When buildbots connect in (.attach), they get a + reference to this instance. 
The BotMaster object is stashed as the + .service attribute.""" slave_commands = None - def __init__(self, slavename, builders, slave_status): - self.slavename = slavename - self.builders = builders - self.slave_status = slave_status + def __init__(self, name): + self.slavename = name + self.slave_status = SlaveStatus(name) + self.builders = [] # list of b.p.builder.Builder instances + self.slave = None # a RemoteReference to the Bot, when connected def addBuilder(self, builder): """Called to add a builder after the slave has connected.""" self.builders.append(builder) - # TODO: resync with slave, to accomodate builders added after - # attach - self.sendBuilderList() + if self.slave: + self.sendBuilderList() def removeBuilder(self, builder): self.builders.remove(builder) - builder.detached() - self.sendBuilderList() + if self.slave: + builder.detached() + self.sendBuilderList() def __repr__(self): return "" % \ @@ -67,14 +69,78 @@ string.join(map(lambda b: b.name, self.builders), ',')) def attached(self, mind): - # this is called shortly after the slave connects. We go through a - # sequence of calls, gathering information, then tell our Builders - # that they have a slave to work with. + # this is called when the slave connects. It returns a Deferred that + # fires with a suitable pb.IPerspective to give to the slave (i.e. + # 'self') + + if self.slave: + # uh-oh, we've got a duplicate slave. The most likely + # explanation is that the slave is behind a slow link, thinks we + # went away, and has attempted to reconnect, so we've got two + # "connections" from the same slave, but the previous one is + # stale. Give the new one precedence. 
+ log.msg("duplicate slave %s replacing old one" % self.slavename) + + # just in case we've got two identically-configured slaves, + # report the IP addresses of both so someone can resolve the + # squabble + tport = self.slave.broker.transport + log.msg("old slave was connected from", tport.getPeer()) + log.msg("new slave is from", mind.broker.transport.getPeer()) + d = self.disconnect() + d.addCallback(lambda res: self._attached(mind)) + return d + + self._attached(mind) + return defer.succeed(self) + + def disconnect(self): + log.msg("disconnecting old slave %s now" % self.slavename) + + # all kinds of teardown will happen as a result of + # loseConnection(), but it happens after a reactor iteration or + # two. Hook the actual disconnect so we can know when it is safe + # to connect the new slave. We have to wait one additional + # iteration (with callLater(0)) to make sure the *other* + # notifyOnDisconnect handlers have had a chance to run. + d = defer.Deferred() + + self.slave.notifyOnDisconnect(lambda res: # TODO: d=d ? + reactor.callLater(0, d.callback, None)) + tport = self.slave.broker.transport + # this is the polite way to request that a socket be closed + tport.loseConnection() + try: + # but really we don't want to wait for the transmit queue to + # drain. The remote end is unlikely to ACK the data, so we'd + # probably have to wait for a (20-minute) TCP timeout. + #tport._closeSocket() + # however, doing _closeSocket (whether before or after + # loseConnection) somehow prevents the notifyOnDisconnect + # handlers from being run. Bummer. 
+ tport.offset = 0 + tport.dataBuffer = "" + pass + except: + # however, these hacks are pretty internal, so don't blow up if + # they fail or are unavailable + log.msg("failed to accelerate the shutdown process") + pass + log.msg("waiting for slave to finish disconnecting") + + # When this Deferred fires, we'll be ready to accept the new slave + return d + + def _attached(self, mind): + # We go through a sequence of calls, gathering information, then + # tell our Builders that they have a slave to work with. self.slave = mind self.slave.callRemote("print", "attached").addErrback(lambda why: 0) self.slave_status.connected = True log.msg("bot attached") + # TODO: there is a window here (while we're retrieving slaveinfo) + # during which a disconnect or a duplicate-slave will be confusing d = self.slave.callRemote("getSlaveInfo") d.addCallback(self.got_info) d.addErrback(self.infoUnavailable) @@ -149,13 +215,16 @@ for name, remote in blist.items(): for b in self.builders: if b.name == name: + # if we sent the builders list because of a config + # change, the Builder might already be attached. + # Builder.attached will ignore us if this happens. b.attached(remote, self.slave_commands) continue def _listFailed(self, why): log.msg("BotPerspective._listFailed") log.err(why) - # TODO: hand up on them, without setBuilderList we can't use them + # TODO: hang up on them, without setBuilderList we can't use them def perspective_forceBuild(self, name, who=None): # slave admins are allowed to force any of their own builds @@ -176,13 +245,11 @@ pass def detached(self, mind): + self.slave = None self.slave_status.connected = False - self.botmaster.detach(self) for b in self.builders: b.detached() - self.builders = [] - log.msg("bot detached") - # this perspective goes away now + log.msg("Botmaster.detached(%s)" % self.slavename) class BotMaster(service.Service): @@ -204,8 +271,7 @@ # which is the master-side object that defines and controls a build. 
# They are added by calling botmaster.addBuilder() from the startup # code. - self.slaves = {} - self.slaveStatus = {} + self.slaves = {} # maps slavename to BotPerspective self.interlocks = {} self.statusClientService = None self.watchers = {} @@ -220,10 +286,20 @@ def waitUntilBuilderDetached(self, name): # convenience function for testing d = defer.Deferred() - b = self.builders[name] + b = self.builders.get(name, None) + if not b or not b.remote: + return defer.succeed(None) b.watchers['detach'].append(d) return d + def addSlave(self, slavename): + slave = BotPerspective(slavename) + self.slaves[slavename] = slave + + def removeSlave(self, slavename): + d = self.slaves[slavename].disconnect + del self.slaves[slavename] + def getBuildernames(self): return self.builderNames @@ -233,24 +309,26 @@ that build: the builds cannot be done until the right slave connects.""" if self.debug: print "addBuilder", builder + log.msg("Botmaster.addBuilder(%s)" % builder.name) + if builder.name in self.builderNames: - raise KeyError, "muliply defined builder '%s'" % builder.name + raise KeyError("muliply defined builder '%s'" % builder.name) + slavename = builder.slavename + if not self.slaves.has_key(slavename): + raise KeyError("builder %s uses undefined slave %s" % \ + (builder.name, slavename)) + self.builders[builder.name] = builder self.builderNames.append(builder.name) builder.setBotmaster(self) self.checkInactiveInterlocks() # TODO?: do this in caller instead? 
- if not self.slaveStatus.has_key(builder.slavename): - # this is a new slave, create a SlaveStatus object for it - s = SlaveStatus(builder.slavename) - self.slaveStatus[builder.slavename] = s - slave = self.slaves.get(builder.slavename) - if slave: - # there is an active slave which needs to be informed about the - # new builder - slave.addBuilder(builder) + + slave = self.slaves[slavename] + slave.addBuilder(builder) def removeBuilder(self, builder): if self.debug: print "removeBuilder", builder + log.msg("Botmaster.removeBuilder(%s)" % builder.name) b = self.builders[builder.name] # any linked interlocks will be made inactive before the builder is # removed @@ -266,18 +344,9 @@ i.deactivate(self.builders) del self.builders[builder.name] self.builderNames.remove(builder.name) - # check for an active slave to remove the builder from - for slavename, slave in self.slaves.items(): - if slavename == builder.slavename: - slave.removeBuilder(builder) - # now see if this was the last builder to use the slave - used = False - for b in self.builders.values(): - if b.slavename == builder.slavename: - used = True - break - if not used: - del self.slaveStatus[builder.slavename] + slave = self.slaves.get(builder.slavename) + if slave: + slave.removeBuilder(builder) def addInterlock(self, interlock): """This is called by the setup code to create build interlocks: @@ -312,40 +381,7 @@ interlock.deactivate(self.builders) def getPerspective(self, slavename): - if self.slaves.has_key(slavename): - # uh-oh, we've got a duplicate slave. 
Try to figure out where the - # old one is coming from so we can explain the problem - log.msg("duplicate slave %s trying to connect" % slavename) - addr = self.slaves[slavename].slave.broker.transport.getPeer() - log.msg("old slave is connected from", addr) - # unfortunately the slave doesn't currently emit this message - raise ValueError("duplicate slave, old one connected from %s" \ - % addr) - - slave_status = self.slaveStatus.get(slavename) - if not slave_status: - # TODO: this is probably broken w.r.t slaves connecting before - # their builders have been configured, or vice versa - slave_status = SlaveStatus(slavename) - self.slaveStatus[slavename] = slave_status - slave_status.connected = True - - # Find all the builders that want to use this slave - builders = [b for (name, b) in self.builders.items() - if b.slavename == slavename] - p = BotPerspective(slavename, builders, slave_status) - p.botmaster = self - self.slaves[slavename] = p - return p - - def detach(self, p): - if not self.slaves[p.slavename] == p: - # TODO: I saw this happen, but I don't know why - log.msg("WEIRD, wrong slave '%s' saying goodbye" % p.slavename) - log.msg(" original:", self.slaves[p.slavename]) - log.msg(" detaching:", p) - self.slaveStatus[p.slavename].connected = False - del self.slaves[p.slavename] + return self.slaves[slavename] def addChange(self, change): for b in self.builders.values(): @@ -459,6 +495,8 @@ def requestAvatar(self, avatarID, mind, interface): assert interface == pb.IPerspective + log.msg("requestAvatar(%s) from %s" % \ + (avatarID, mind.broker.transport.getPeer())) afactory = self.names.get(avatarID) if afactory: p = afactory.getPerspective() @@ -474,14 +512,14 @@ p = self.botmaster.getPerspective(avatarID) if not p: - raise ValueError, "no perspective for '%s'" % avatarID - p.attached(mind) # perhaps .callLater(0) ? 
- # TODO: especially for BotPerspectives - # TODO: the slave might be removed from BotMaster.slaves by the time - # the .detached callback is run, causing the assert in - # BotMaster.detach to fail - return (pb.IPerspective, p, - lambda p=p,mind=mind: p.detached(mind)) + raise ValueError("no perspective for '%s'" % avatarID) + + d = defer.maybeDeferred(p.attached, mind) + d.addCallback(self._avatarAttached, mind) + return d + + def _avatarAttached(self, p, mind): + return (pb.IPerspective, p, lambda p=p,mind=mind: p.detached(mind)) ######################################## @@ -537,6 +575,7 @@ self.statusTargets = [] + self.bots = [] self.sources = [] self.readConfig = False @@ -689,6 +728,12 @@ raise TypeError, "webPortnum '%s' must be an int" % webPortnum for s in status: assert interfaces.IStatusReceiver(s) + if 0: # tuple-specified builders are a problem + slavenames = [name for name,pw in bots] + for b in builders: + if b['slavename'] not in slavenames: + raise ValueError("builder %s uses undefined slave %s" \ + % (b['name'], b['slavename'])) # now we're committed to implementing the new configuration, so do # it atomically @@ -700,13 +745,7 @@ # self.bots: Disconnect any that were attached and removed from the # list. Update self.checker with the new list of passwords, # including debug/change/status. 
- self.checker.users = {} # violates abstraction, oh well - for user, passwd in bots: - self.checker.addUser(user, passwd) - self.checker.addUser("change", "changepw") - - # TODO: hang up on old bots - self.bots = bots + self.loadConfig_Slaves(bots) # self.debugPassword if debugPassword: @@ -728,15 +767,7 @@ self.manhole = manhole manhole.setServiceParent(self) - # self.sources: shut down any that were removed, start any that were - # added - old = self.sources - new = sources - [self.change_svc.removeSource(source) - for source in old if source not in new] - [self.change_svc.addSource(source) - for source in new if source not in old] - self.sources = sources + self.loadConfig_Sources(sources) # add/remove self.botmaster.builders to match builders. The # botmaster will handle startup/shutdown issues. @@ -761,6 +792,35 @@ log.msg("configuration updated") + def loadConfig_Slaves(self, bots): + # set up the Checker with the names and passwords of all valid bots + self.checker.users = {} # violates abstraction, oh well + for user, passwd in bots: + self.checker.addUser(user, passwd) + self.checker.addUser("change", "changepw") + + # identify new/old bots + old = self.bots; oldnames = [name for name,pw in old] + new = bots; newnames = [name for name,pw in new] + # removeSlave will hang up on the old bot + [self.botmaster.removeSlave(name) + for name in oldnames if name not in newnames] + [self.botmaster.addSlave(name) + for name in newnames if name not in oldnames] + + # all done + self.bots = bots + + def loadConfig_Sources(self, sources): + # shut down any that were removed, start any that were added + old = self.sources + new = sources + [self.change_svc.removeSource(source) + for source in old if source not in new] + [self.change_svc.addSource(source) + for source in new if source not in old] + self.sources = sources + def loadConfig_Builders(self, newBuilders): old = self.botmaster.getBuildernames() newNames = [] From warner at users.sourceforge.net Fri Dec 3 
22:54:52 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Fri, 03 Dec 2004 22:54:52 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.319,1.320 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004 Modified Files: ChangeLog Log Message: Make commands (and builds) interruptible. Improve lost-slave behavior. Merging in several days of changes from local Arch branch, see ChangeLog for details about individual files. Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.319 retrieving revision 1.320 diff -u -d -r1.319 -r1.320 --- ChangeLog 28 Nov 2004 01:53:52 -0000 1.319 +++ ChangeLog 3 Dec 2004 22:54:50 -0000 1.320 @@ -1,3 +1,140 @@ +2004-12-03 Brian Warner + + * buildbot/master.py: clean up slave-handling code, to handle + slave-disconnect and multiple-connect better + (BotPerspective): make these long-lasting, exactly one per bot + listed in the config file. + (BotPerspective.attached): if a slave connects while an existing + one appears to still be connected, disconnect the old one first. + (BotPerspective.disconnect): new method to forcibly disconnect a + buildslave. Use some hacks to empty the transmit buffer quickly to + avoid the long (20-min?) TCP timeout that could occur if the old + slave has dropped off the net. + (BotMaster): Keep persistent BotPerspectives in .slaves, let them + own their own SlaveStatus objects. Remove .attached/.detached, add + .addSlave/.removeSlave, treat slaves like Builders (config file + parsing sends deltas to the BotMaster). Inform the slave + instances, i.e. the BotPerspective, about addBuilder and + removeBuilder. + (BotMaster.getPerspective): turns into a single dict lookup + (Dispatcher.requestAvatar): allow .attached to return a Deferred, + which gives BotPerspective.attached a chance to disconnect the old + slave first. 
+ (BuildMaster.loadConfig): add code (disabled) to validate that all + builders use known slaves (listed in c['bots']). The check won't + work with tuple-specified builders, which are deprecated but not + yet invalid, so the check is disabled for now. + (BuildMaster.loadConfig_Slaves): move slave-config into a separate + routine, do the add/changed/removed dance with them like we do + with builders. + (BuildMaster.loadConfig_Sources): move source-config into a + separate routine too + + * buildbot/status/builder.py (Status.getSlave): get the + SlaveStatus object from the BotPerspective, not the BotMaster. + + * buildbot/test/test_run.py: bunch of new tests for losing the + buildslave at various points in the build, handling a slave that + connects multiple times, and making sure we can interrupt a + running build + + * buildbot/slave/bot.py (BuildSlave): make it possible to use + something other than 'Bot' for the Bot object, to make certain + test cases easier to write. + (BuildSlave.waitUntilDisconnected): utility method for testing + +2004-11-30 Brian Warner + + * buildbot/test/test_run.py (RunMixin): refactor, remove debug msg + + * buildbot/interfaces.py (IBuilderControl.ping): add timeout= + argument, return a Deferred that always fires with True or False. + I don't use an errback to indicate 'ping failed' so that callers + are free to ignore the deferred without causing spurious errors in + the logs. + * buildbot/process/builder.py (BuilderControl.ping): implement it + + * buildbot/test/test_run.py (Status.testDisappear): test ping + (Status.disappearSlave): fix it + +2004-11-30 Brian Warner + + * buildbot/interfaces.py (IBuildControl): add .stopBuild + (IBuilderControl): add .getBuild(num), only works for the current + build, of course, although it might be interesting to offer + something for builds in the .waiting or .interlocked state. + + * buildbot/process/base.py (Build): have .stopBuild just do the + interrupt, then let the build die by itself. 
+ (BuildControl): add .stopBuild, and add a point-event named + 'interrupt' just after the build so status viewers can tell that + someone killed it. + (BuilderControl): add .getBuild + + * buildbot/process/step.py (Dummy): use haltOnFailure so it really + stops when you kill it, good for testing + (ShellCommand.interrupt): add a logfile named 'interrupt' which + contains the 'reason' text. + + * buildbot/status/html.py: Add Stop Build button, if the build can + still be stopped. Send a Redirect (to the top page) one second + later, hopefully long enough for the interrupt to have an effect. + Move make_row() up to top-level to share it between Stop Build and + Force Build. + + * buildbot/slave/commands.py: only kill the child process once + + * buildbot/test/test_run.py: add testInterrupt + +2004-11-29 Brian Warner + + * buildbot/process/base.py: Refactor command interruption. The + Build is now responsible for noticing that the slave has gone + away: Build.lostRemote() interrupts the current step and makes + sure that no further ones will be started. + + * buildbot/process/builder.py: When the initial remote_startBuild + message fails, log it: this usually indicates that the slave has + gone away, but we don't really start paying attention until they + fail to respond to the first step's command. + + * buildbot/process/step.py (RemoteCommand): Does *not* watch for + slave disconnect. Now sports a new interrupt() method. Error + handling was simplified a lot by chaining deferreds, so + remoteFailed/remoteComplete were merged into a single + remoteComplete method (which can now get a Failure object). + Likewise failed/finished were merged into just _finished. + (BuildStep): Add interrupt(why) method, and if why is a + ConnectionLost Failure then the step is failed with some useful + error text. + + * buildbot/slave/bot.py: stop the current command when the remote + Step reference is lost, and when the slave is shut down. 
+ (Bot): make it a MultiService, so it can have children. Use + stopService to tell when the slave is shutting down. + (SlaveBuilder): make it a Service, and a child of the Bot. Add + remote_interruptCommand (which asks the current SlaveCommand to + stop but allows it to keep emitting status messages), and + stopCommand (which tells it to shut up and die). + + * buildbot/slave/commands.py: make commands interruptible + (ShellCommand.kill): factor out os.kill logic + (Command): factor out setup() + (Command.sendStatus): don't send status if .running is false, this + happens when the command has been halted. + (Command.interrupt): new method, used to tell the command to die + (SlaveShellCommand): implement .interrupt + (DummyCommand): implement .interrupt + (SourceBase, etc): factor out setup(), don't continue substeps if + .interrupted is set + + * buildbot/status/builder.py: fix all waitUntilFinished() methods + so they can be called after finishing + + * buildbot/test/test_run.py: new tests for disconnect behavior, + refactor slave-shutdown routines, add different kinds of + slave-shutdown + 2004-11-27 Brian Warner * buildbot/status/words.py (IrcStatusBot.convertTime): utility From warner at users.sourceforge.net Sat Dec 4 21:02:06 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 21:02:06 +0000 Subject: [Buildbot-commits] buildbot/buildbot/slave commands.py,1.17,1.18 bot.py,1.4,1.5 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/slave In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14277/buildbot/slave Modified Files: commands.py bot.py Log Message: * buildbot/slave/bot.py: clean up shutdown/lose-master code (SlaveBuilder): make some attributes class-level, remove the old "update queue" which existed to support resuming a build after the master connection was lost. Try to reimplement that feature later. 
(SlaveBuilder.stopCommand): clear self.command when the SlaveCommand finishes, so that we don't try to kill a leftover one at shutdown time. (SlaveBuilder.commandComplete): same, merge with commandFailed and .finishCommand * buildbot/slave/commands.py (SourceBase): set self.command for all VC commands, so they can be interrupted. Index: bot.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/bot.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -d -r1.4 -r1.5 --- bot.py 3 Dec 2004 22:54:52 -0000 1.4 +++ bot.py 4 Dec 2004 21:02:03 -0000 1.5 @@ -44,32 +44,27 @@ stopCommandOnShutdown = True + # remote is a ref to the Builder object on the master side, and is set + # when they attach. We use it to detect when the connection to the master + # is severed. + remote = None + + # .build points to a SlaveBuild object, a new one for each build + build = None + + # .command points to a SlaveCommand instance, and is set while the step + # is running. We use it to implement the stopBuild method. + command = None + + # .remoteStep is a ref to the master-side BuildStep object, and is set + # when the step is started + remoteStep = None + def __init__(self, parent, name, builddir, not_really): #service.Service.__init__(self) self.name = name self.builddir = builddir self.not_really = not_really - self.remote = None - # remote is a ref to the Builder object on the master side, and is - # set when they attach. It really isn't used very much - self.build = None - # .build points to a SlaveBuild object, a new one for each build - self.command = None - # .command points to a SlaveCommand instance, and is set when the - # step is started. 
It remains until the step is completed and the - # completion is acknowledged (via .ackComplete) - self.remoteStep = None - # .remoteStep is a ref to the master-side BuildStep object, and is - # set when the step is started - self.updateNum = None - # .updateNum counts status updates. It is reset to zero at the - # beginning of each step. Numbering the updates makes it possible to - # reattach to a master that has been restarted - self.updateQueue = [] - # unacknowledged status updates are kept in .updateQueue, and are - # removed by .ackUpdate - self.complete = None - # an unacknowledged completion message lives in .complete def __repr__(self): return "" % self.name @@ -83,9 +78,8 @@ def stopService(self): service.Service.stopService(self) - if self.command and self.stopCommandOnShutdown: + if self.stopCommandOnShutdown: self.stopCommand() - self.command = None def remote_setMaster(self, remote): self.remote = remote @@ -100,7 +94,7 @@ def lostRemoteStep(self, remotestep): log.msg("lost remote step") self.remoteStep = None - if self.command and self.stopCommandOnShutdown: + if self.stopCommandOnShutdown: self.stopCommand() # the following are Commands that can be invoked by the master-side @@ -111,6 +105,7 @@ one step to the next.""" self.build = SlaveBuild(self) log.msg("startBuild") + def remote_startCommand(self, stepref, stepId, command, args): """This is called multiple times by various master-side BuildSteps, to start various commands that actually do the build.""" @@ -118,7 +113,6 @@ if self.command: log.msg("leftover command, dropping it") self.stopCommand() - self.command = None try: factory, version = registry.commandRegistry[command] @@ -127,14 +121,12 @@ self.command = factory(self, stepId, args) log.msg(" startCommand:%s [id %s]" % (command,stepId)) - self.updateQueue = [] self.remoteStep = stepref self.remoteStep.notifyOnDisconnect(self.lostRemoteStep) - self.updateNum = 0 - self.complete = None self.command.running = True d = 
defer.maybeDeferred(self.command.start) - d.addCallbacks(self.commandComplete, self.commandFailed) + d.addCallback(lambda res: None) + d.addBoth(self.commandComplete) return None def remote_interruptCommand(self, stepId, why): @@ -147,120 +139,76 @@ return self.command.interrupt() + def stopCommand(self): + """Make any currently-running command die, with no further status + output. This is used when the buildslave is shutting down or the + connection to the master has been lost. Interrupt the command, + silence it, and then forget about it.""" if not self.command: return - self.command.running = False - if not self.command.interrupted: - self.command.interrupt() - + log.msg("stopCommand: halting current command %s" % self.command) + self.command.running = False # shut up! + self.command.interrupt() # die! + self.command = None # forget you! - # these two are fired by the Deferred attached to each Command - def commandComplete(self, dummy): - if not self.running: - return - self.sendComplete() - def commandFailed(self, why): - if not self.running: - return - log.msg("commandFailed") - log.err(why) - self.sendComplete(why) # sendUpdate is invoked by the Commands we spawn - def sendUpdate(self, data=None): + def sendUpdate(self, data): """This sends the status update to the master-side BuildStep object, giving it a sequence number in the process. It adds the update to a queue, and asks the master to acknowledge the update so it can be removed from that queue.""" if not self.running: + # .running comes from service.Service, and says whether the + # service is running or not. If we aren't running, don't send any + # status messages. return - self.updateNum += 1 - update = [data, self.updateNum] - #log.msg("sendUpdate", update) - self.updateQueue.append(update) - if self.remoteStep: # ?? send to Builder or BuildStep? + # the update[1]=0 comes from the leftover 'updateNum', which the + # master still expects to receive. 
Provide it to avoid significant + # interoperability issues between new slaves and old masters. + if self.remoteStep: + update = [data, 0] updates = [update] d = self.remoteStep.callRemote("update", updates) d.addCallback(self.ackUpdate) d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate") - # these are utility routines used by sendStatus and commandComplete - - def dummy(self, value): + def ackUpdate(self, acknum): + # TODO: update the "last activity" timer pass - def ackUpdate(self, acknum): - """Normally, the master responds to remote_update by returning the - update number of the highest contiguous update received. That number - comes back to this routine, which removes the acknowledged updates - from the queue.""" - # XXX: revamp this, I think it needs a retransmission timeout to - # deal with sendAllUpdates that don't all get acknowledged. Might be - # ok, though. - unacked = [] - for update in self.updateQueue: - (data, updatenum) = update - if updatenum > acknum: - unacked.append(update) - self.updateQueue = unacked - # also, if the terminal status message (resulting from - # commandComplete or commandFailed) is acked, we can finally get rid - # of the command by clearing .stepRef and .command. We have to do - # this, otherwise we'll think we're still running the command and - # won't be able to answer remote_reattach correctly. XXX: is that - # true? + def ackComplete(self, dummy): + # TODO: update the "last activity" timer + pass def _ackFailed(self, why, where): log.msg("SlaveBuilder._ackFailed:", where) #log.err(why) # we don't really care - def sendAllUpdates(self): - """This is called after reattachment to send all queued updates.""" - if self.updateQueue and self.remoteStep: - d = self.remoteStep.callRemote("update", self.updateQueue) - d.addCallback(self.ackUpdate) - d.addErrback(self._ackFailed, "SlaveBuilder.sendAllUpdates") - def sendComplete(self, failure=None): - # failure, if present, is a failure.Failure. 
To send it across the - # wire, we must turn it into a pb.CopyableFailure. + # this is fired by the Deferred attached to each Command + def commandComplete(self, failure): if failure: + log.msg("SlaveBuilder.commandFailed", self.command) + log.err(why) + # failure, if present, is a failure.Failure. To send it across + # the wire, we must turn it into a pb.CopyableFailure. failure = pb.CopyableFailure(failure) failure.unsafeTracebacks = True - self.complete = [failure] + else: + # failure is None + log.msg("SlaveBuilder.commandComplete", self.command) + self.command = None + if not self.running: + return if self.remoteStep: + self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep) d = self.remoteStep.callRemote("complete", failure) d.addCallback(self.ackComplete) d.addErrback(self._ackFailed, "sendComplete") + self.remoteStep = None - def ackComplete(self, dummy): - # this is the call that finally finishes the step - self.finishCommand() - - def sendAllCompletes(self): - if self.complete and self.remoteStep: - d = self.remoteStep.callRemote("complete", self.complete[0]) - d.addCallback(self.ackComplete) - d.addErrback(self._ackFailed, "sendAllCompletes") - - def remote_reattach(self, stepref, stepId): - # were we executing something? - if not self.command: - raise NoCommandRunning - # were we executing the same thing that they think we were? 
- if self.command.stepId != stepId: - raise WrongCommandRunning - # send them our unacked status - self.remoteStep = stepref - self.sendAllUpdates() - self.sendAllCompletes() - - def finishCommand(self): - log.msg("SlaveBuilder.finishCommand", self.command) - self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep) - self.remoteStep = None - self.command = None def remote_shutdown(self): print "slave shutting down on command from master" Index: commands.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/commands.py,v retrieving revision 1.17 retrieving revision 1.18 diff -u -d -r1.17 -r1.18 --- commands.py 3 Dec 2004 22:54:52 -0000 1.17 +++ commands.py 4 Dec 2004 21:02:03 -0000 1.18 @@ -305,7 +305,7 @@ If .running is False, the bot is shutting down (or has otherwise lost the connection to the master), and should not send any status messages. This - is taken care of in Command.sendStatus . + is checked in Command.sendStatus . """ @@ -329,7 +329,11 @@ pass def start(self): - # should return a Deferred + """Start the command. self.running will be set just before this is + called. This method should return a Deferred that will fire when the + command has completed. + + This method should be overridden by subclasses.""" raise NotImplementedError, "You must implement this in a subclass" def sendStatus(self, status): @@ -343,10 +347,12 @@ def interrupt(self): """Override this in a subclass to allow commands to be interrupted. 
- May be called multiple times, use self.interrupted=True if this - matters.""" + May be called multiple times, test and set self.interrupted=True if + this matters.""" pass + # utility methods, mostly used by SlaveShellCommand and the like + def _abandonOnFailure(self, rc): if type(rc) is not int: log.msg("weird, _abandonOnFailure was given rc=%s (%s)" % \ @@ -423,6 +429,8 @@ return self.d def interrupt(self): + if self.interrupted: + return self.timer.cancel() self.timer = None self.interrupted = True @@ -578,6 +586,7 @@ command = ["rm", "-rf", d] c = ShellCommand(self.builder, command, self.builder.basedir, sendRC=0, timeout=self.timeout) + self.command = c # sendRC=0 means the rm command will send stdout/stderr to the # master, but not the rc=0 when it finishes. That job is left to # _sendRC @@ -595,6 +604,7 @@ command = ['cp', '-r', fromdir, todir] c = ShellCommand(self.builder, command, self.builder.basedir, sendRC=False, timeout=self.timeout) + self.command = c d = c.start() d.addCallback(self._abandonOnFailure) return d @@ -609,6 +619,7 @@ c = ShellCommand(self.builder, command, dir, sendRC=False, timeout=self.timeout, stdin=diff) + self.command = c d = c.start() d.addCallback(self._abandonOnFailure) return d @@ -651,6 +662,7 @@ c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout, stdin=self.login+"\n") + self.command = c d = c.start() d.addCallback(self._abandonOnFailure) d.addCallback(self._didLogin) @@ -671,6 +683,7 @@ command += ['-D', self.revision] c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout) + self.command = c return c.start() def doVCFull(self): @@ -689,6 +702,7 @@ command += [self.cvsmodule] c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout) + self.command = c return c.start() registerSlaveCommand("cvs", CVS, cvs_ver) @@ -720,6 +734,7 @@ command = ['svn', 'update', '--revision', str(revision)] c = ShellCommand(self.builder, command, d, sendRC=False, 
timeout=self.timeout) + self.command = c return c.start() def doVCFull(self): @@ -734,6 +749,7 @@ self.svnurl, self.srcdir] c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout) + self.command = c return c.start() registerSlaveCommand("svn", SVN, cvs_ver) @@ -765,6 +781,7 @@ command = ['darcs', 'pull', '--all', '--verbose'] c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout) + self.command = c return c.start() def doVCFull(self): @@ -776,6 +793,7 @@ self.repourl] c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout) + self.command = c return c.start() registerSlaveCommand("darcs", Darcs, cvs_ver) @@ -812,6 +830,7 @@ command = ['tla', 'update'] c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout) + self.command = c return c.start() def doVCFull(self): @@ -830,6 +849,7 @@ c = ShellCommand(self.builder, command, self.builder.basedir, sendRC=False, keepStdout=True, timeout=self.timeout) + self.command = c d = c.start() d.addCallback(self._abandonOnFailure) d.addCallback(self._didRegister, c) @@ -851,6 +871,7 @@ self.version, self.srcdir] c = ShellCommand(self.builder, command, self.builder.basedir, sendRC=False, timeout=self.timeout) + self.command = c d = c.start() d.addCallback(self._abandonOnFailure) if self.buildconfig: @@ -862,6 +883,7 @@ command = ['tla', 'build-config', self.buildconfig] c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout) + self.command = c d = c.start() d.addCallback(self._abandonOnFailure) return d @@ -893,6 +915,7 @@ env = {'P4PORT': self.p4port} c = ShellCommand(self.builder, command, d, environ=env, sendRC=False, timeout=self.timeout) + self.command = c return c.start() def doVCFull(self): From warner at users.sourceforge.net Sat Dec 4 21:02:06 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 21:02:06 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.320,1.321 Message-ID: 
Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14277 Modified Files: ChangeLog Log Message: * buildbot/slave/bot.py: clean up shutdown/lose-master code (SlaveBuilder): make some attributes class-level, remove the old "update queue" which existed to support resuming a build after the master connection was lost. Try to reimplement that feature later. (SlaveBuilder.stopCommand): clear self.command when the SlaveCommand finishes, so that we don't try to kill a leftover one at shutdown time. (SlaveBuilder.commandComplete): same, merge with commandFailed and .finishCommand * buildbot/slave/commands.py (SourceBase): set self.command for all VC commands, so they can be interrupted. Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.320 retrieving revision 1.321 diff -u -d -r1.320 -r1.321 --- ChangeLog 3 Dec 2004 22:54:50 -0000 1.320 +++ ChangeLog 4 Dec 2004 21:02:04 -0000 1.321 @@ -1,3 +1,18 @@ +2004-12-04 Brian Warner + + * buildbot/slave/bot.py: clean up shutdown/lose-master code + (SlaveBuilder): make some attributes class-level, remove the old + "update queue" which existed to support resuming a build after the + master connection was lost. Try to reimplement that feature later. + (SlaveBuilder.stopCommand): clear self.command when the + SlaveCommand finishes, so that we don't try to kill a leftover one + at shutdown time. + (SlaveBuilder.commandComplete): same, merge with commandFailed and + .finishCommand + + * buildbot/slave/commands.py (SourceBase): set self.command for + all VC commands, so they can be interrupted. 
+ 2004-12-03 Brian Warner * buildbot/master.py: clean up slave-handling code, to handle From warner at users.sourceforge.net Sat Dec 4 21:12:22 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 21:12:22 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.321,1.322 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16232 Modified Files: ChangeLog Log Message: (Dispatcher.requestAvatar): remove debug message that broke PBChangeSource Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.321 retrieving revision 1.322 diff -u -d -r1.321 -r1.322 --- ChangeLog 4 Dec 2004 21:02:04 -0000 1.321 +++ ChangeLog 4 Dec 2004 21:12:20 -0000 1.322 @@ -1,5 +1,8 @@ 2004-12-04 Brian Warner + * buildbot/master.py (Dispatcher.requestAvatar): remove debug + message that broke PBChangeSource + * buildbot/slave/bot.py: clean up shutdown/lose-master code (SlaveBuilder): make some attributes class-level, remove the old "update queue" which existed to support resuming a build after the From warner at users.sourceforge.net Sat Dec 4 21:12:22 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 21:12:22 +0000 Subject: [Buildbot-commits] buildbot/buildbot master.py,1.55,1.56 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16232/buildbot Modified Files: master.py Log Message: (Dispatcher.requestAvatar): remove debug message that broke PBChangeSource Index: master.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/master.py,v retrieving revision 1.55 retrieving revision 1.56 diff -u -d -r1.55 -r1.56 --- master.py 3 Dec 2004 22:54:50 -0000 1.55 +++ master.py 4 Dec 2004 21:12:19 -0000 1.56 @@ -495,8 +495,6 @@ def requestAvatar(self, avatarID, mind, interface): 
assert interface == pb.IPerspective - log.msg("requestAvatar(%s) from %s" % \ - (avatarID, mind.broker.transport.getPeer())) afactory = self.names.get(avatarID) if afactory: p = afactory.getPerspective() From warner at users.sourceforge.net Sat Dec 4 21:18:26 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 21:18:26 +0000 Subject: [Buildbot-commits] buildbot/buildbot/status words.py,1.29,1.30 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/status In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv17516/buildbot/status Modified Files: words.py Log Message: (IrcStatusBot.command_STOP): add a 'stop build' command to the IRC bot Index: words.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/status/words.py,v retrieving revision 1.29 retrieving revision 1.30 diff -u -d -r1.29 -r1.30 --- words.py 28 Nov 2004 05:20:23 -0000 1.29 +++ words.py 4 Dec 2004 21:18:24 -0000 1.30 @@ -256,6 +256,35 @@ d.addCallback(self.buildFinished, reply) command_FORCE.usage = "force build - Force a build" + def command_STOP(self, user, reply, args): + args = args.split(None, 2) + if len(args) < 3 or args[0] != 'build': + raise UsageError, "try 'stop build WHICH '" + which = args[1] + reason = args[2] + + buildercontrol = self.getControl(which) + + who = None + r = "stopped: by IRC user <%s>: %s" % (user, reason) + + # find an in-progress build + builderstatus = self.getBuilder(which) + buildstatus = builderstatus.getCurrentBuild() + if not buildstatus: + self.reply(reply, "sorry, no build is currently running") + return + num = buildstatus.getNumber() + + # obtain the BuildControl object + buildcontrol = buildercontrol.getBuild(num) + + # make it stop + bc.stopBuild(r) + + self.reply(reply, "build %d interrupted" % num) + command_STOP.usage = "stop build - Stop a running build" + def emit_status(self, reply, which): b = self.getBuilder(which) str = "%s: " % which From warner at 
users.sourceforge.net Sat Dec 4 21:18:26 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 21:18:26 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.322,1.323 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv17516 Modified Files: ChangeLog Log Message: (IrcStatusBot.command_STOP): add a 'stop build' command to the IRC bot Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.322 retrieving revision 1.323 diff -u -d -r1.322 -r1.323 --- ChangeLog 4 Dec 2004 21:12:20 -0000 1.322 +++ ChangeLog 4 Dec 2004 21:18:24 -0000 1.323 @@ -1,5 +1,8 @@ 2004-12-04 Brian Warner + * buildbot/status/words.py (IrcStatusBot.command_STOP): add a + 'stop build' command to the IRC bot + * buildbot/master.py (Dispatcher.requestAvatar): remove debug message that broke PBChangeSource From warner at users.sourceforge.net Sat Dec 4 21:20:21 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 21:20:21 +0000 Subject: [Buildbot-commits] buildbot/buildbot/status words.py,1.30,1.31 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/status In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv18016 Modified Files: words.py Log Message: fix stupid typo Index: words.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/status/words.py,v retrieving revision 1.30 retrieving revision 1.31 diff -u -d -r1.30 -r1.31 --- words.py 4 Dec 2004 21:18:24 -0000 1.30 +++ words.py 4 Dec 2004 21:20:18 -0000 1.31 @@ -280,7 +280,7 @@ buildcontrol = buildercontrol.getBuild(num) # make it stop - bc.stopBuild(r) + buildcontrol.stopBuild(r) self.reply(reply, "build %d interrupted" % num) command_STOP.usage = "stop build - Stop a running build" From warner at users.sourceforge.net Sat Dec 4 22:17:03 2004 From: warner at 
users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 22:17:03 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.323,1.324 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv29472 Modified Files: ChangeLog Log Message: update to current usage Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.323 retrieving revision 1.324 diff -u -d -r1.323 -r1.324 --- ChangeLog 4 Dec 2004 21:18:24 -0000 1.323 +++ ChangeLog 4 Dec 2004 22:17:00 -0000 1.324 @@ -1,5 +1,7 @@ 2004-12-04 Brian Warner + * docs/examples/twisted_master.cfg: update to current usage + * buildbot/status/words.py (IrcStatusBot.command_STOP): add a 'stop build' command to the IRC bot From warner at users.sourceforge.net Sat Dec 4 22:17:02 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 22:17:02 +0000 Subject: [Buildbot-commits] buildbot/docs/examples twisted_master.cfg,1.25,1.26 Message-ID: Update of /cvsroot/buildbot/buildbot/docs/examples In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv29472/docs/examples Modified Files: twisted_master.cfg Log Message: update to current usage Index: twisted_master.cfg =================================================================== RCS file: /cvsroot/buildbot/buildbot/docs/examples/twisted_master.cfg,v retrieving revision 1.25 retrieving revision 1.26 diff -u -d -r1.25 -r1.26 --- twisted_master.cfg 7 Nov 2004 20:17:34 -0000 1.25 +++ twisted_master.cfg 4 Dec 2004 22:16:59 -0000 1.26 @@ -76,7 +76,7 @@ 'slavename': "bot1", 'builddir': "quick", 'factory': QuickTwistedBuildFactory(svnurl, - python="python2.2"), + python=["python2.2", "python2.3"]), } builders.append(b1) @@ -129,7 +129,7 @@ } builders.append(b3) -reactors = ['gtk2', 'gtk', 'poll'] +reactors = ['gtk2', 'gtk', 'qt', 'poll'] b4 = {'name': "reactors", 'slavename': "bot2", 'builddir': "reactors", @@ -162,7 +162,7 
@@ builders.append(b22w32) b23bsd = {'name': "freebsd", - 'slavename': "bot-dialtone", + 'slavename': "bot-suszko", 'builddir': "bsd-full2.2", 'factory': TwistedReactorsBuildFactory(svnurl, python="python2.3", @@ -204,7 +204,8 @@ c['manhole'] = master.Manhole(*private.manhole) c['status'].append(client.PBListener(9936)) m = mail.MailNotifier(fromaddr="buildbot at twistedmatrix.com", - builders=["quick", "full-2.2", "full-2.3", "full-2.4"], + #builders=["quick", "full-2.2", "full-2.3", "full-2.4"], + builders=["quick", "full-2.3"], sendToInterestedUsers=True, extraRecipients=["warner at lothar.com"], mode="problem", From warner at users.sourceforge.net Sat Dec 4 22:30:25 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 22:30:25 +0000 Subject: [Buildbot-commits] buildbot/buildbot/test test_slavecommand.py,1.6,1.7 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/test In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32089/buildbot/test Modified Files: test_slavecommand.py Log Message: use sys.executable instead of hard-coding 'python' for child commands, might help portability Index: test_slavecommand.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/test/test_slavecommand.py,v retrieving revision 1.6 retrieving revision 1.7 diff -u -d -r1.6 -r1.7 --- test_slavecommand.py 14 Oct 2004 17:28:34 -0000 1.6 +++ test_slavecommand.py 4 Dec 2004 22:30:23 -0000 1.7 @@ -11,7 +11,7 @@ import sys startLogging(sys.stdout) -import re, time +import re, time, sys import signal from buildbot.slave.commands import SlaveShellCommand @@ -115,7 +115,7 @@ self.assertEquals(got, expected) def testShell1(self): - cmd = "python emit.py 0" + cmd = sys.executable + " emit.py 0" args = {'command': cmd, 'workdir': '.', 'timeout': 5} failed = self.doTest(SlaveShellCommand, args) self.failIf(failed) @@ -124,7 +124,7 @@ self.checkrc(0) def testShell2(self): - cmd = "python emit.py 1" + cmd 
= sys.executable + " emit.py 1" args = {'command': cmd, 'workdir': '.', 'timeout': 5} failed = self.doTest(SlaveShellCommand, args) self.failIf(failed) @@ -133,7 +133,7 @@ self.checkrc(1) def testShell3(self): - cmd = "python emit.py 0" + cmd = sys.executable + " emit.py 0" args = {'command': cmd, 'workdir': '.', 'env': {'EMIT_TEST': "envtest"}, 'timeout': 5} failed = self.doTest(SlaveShellCommand, args) @@ -145,7 +145,7 @@ self.checkrc(0) def testShell4(self): - cmd = "python emit.py 0" + cmd = sys.executable + " emit.py 0" args = {'command': cmd, 'workdir': "subdir", 'timeout': 5} failed = self.doTest(SlaveShellCommand, args) self.failIf(failed) From warner at users.sourceforge.net Sat Dec 4 22:30:25 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 04 Dec 2004 22:30:25 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.324,1.325 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32089 Modified Files: ChangeLog Log Message: use sys.executable instead of hard-coding 'python' for child commands, might help portability Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.324 retrieving revision 1.325 diff -u -d -r1.324 -r1.325 --- ChangeLog 4 Dec 2004 22:17:00 -0000 1.324 +++ ChangeLog 4 Dec 2004 22:30:23 -0000 1.325 @@ -1,5 +1,8 @@ 2004-12-04 Brian Warner + * buildbot/test/test_slavecommand.py: use sys.executable instead + of hard-coding 'python' for child commands, might help portability + * docs/examples/twisted_master.cfg: update to current usage * buildbot/status/words.py (IrcStatusBot.command_STOP): add a From warner at users.sourceforge.net Mon Dec 6 01:23:15 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 01:23:15 +0000 Subject: [Buildbot-commits] buildbot/docs source.xhtml,1.3,1.4 Message-ID: Update of /cvsroot/buildbot/buildbot/docs In directory 
sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32477/docs Modified Files: source.xhtml Log Message: (Arch): correct terminology Index: source.xhtml =================================================================== RCS file: /cvsroot/buildbot/buildbot/docs/source.xhtml,v retrieving revision 1.3 retrieving revision 1.4 diff -u -d -r1.3 -r1.4 --- source.xhtml 8 Sep 2004 19:36:17 -0000 1.3 +++ source.xhtml 6 Dec 2004 01:23:12 -0000 1.4 @@ -147,7 +147,7 @@ timestamp.

    Arch specifies a repository by -URL, as well as a revision which is kind of like a branch name. Arch +URL, as well as a version which is kind of like a branch name. Arch uses the word archive to represent the repository. Arch lets you push changes from one archive to another, removing the strict centralization required by CVS and SVN. It seems to retain the distinction between From warner at users.sourceforge.net Mon Dec 6 01:23:15 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 01:23:15 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.325,1.326 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32477 Modified Files: ChangeLog Log Message: (Arch): correct terminology Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.325 retrieving revision 1.326 diff -u -d -r1.325 -r1.326 --- ChangeLog 4 Dec 2004 22:30:23 -0000 1.325 +++ ChangeLog 6 Dec 2004 01:23:13 -0000 1.326 @@ -1,3 +1,7 @@ +2004-12-05 Brian Warner + + * docs/source.xhtml (Arch): correct terminology + 2004-12-04 Brian Warner * buildbot/test/test_slavecommand.py: use sys.executable instead From warner at users.sourceforge.net Mon Dec 6 03:09:27 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 03:09:27 +0000 Subject: [Buildbot-commits] buildbot/docs slave.xhtml,NONE,1.1 Message-ID: Update of /cvsroot/buildbot/buildbot/docs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22115/docs Added Files: slave.xhtml Log Message: provide a buildslave setup checklist --- NEW FILE: slave.xhtml --- Buildslave Configuration

    Buildslave Configuration

    This document describes the basics of setting up a buildslave.

    Typically, you will be adding a buildslave to an existing buildmaster, to provide additional architecture coverage. The buildbot administrator will give you several pieces of information necessary to connect to the buildmaster. You should also be somewhat familiar with the project being tested, so you can troubleshoot build problems locally.

    The buildbot exists to make sure that the project's stated "how to build it" process actually works. To this end, the buildslave should run in an environment just like that of your regular developers. Typically the project build process is documented somewhere (README, INSTALL, etc), in a document that should mention all library dependencies and contain a basic set of build instructions. This document will be useful as you configure the host and account in which the buildslave runs.

    Checklist

    You will need the following pieces before starting your buildslave for the first time:

    1. Set up the account: It is recommended (although not mandatory) to set up a separate user account for the buildslave. This account is frequently named 'buildbot'. This serves to isolate your personal working environment from that of the slave, and helps to minimize the security threat posed by letting possibly-unknown contributors run arbitrary code on your system. The account should have a minimum of fancy init scripts.
    2. Set up the host: Make sure the host can actually reach the buildmaster. Usually the buildmaster is running a status webserver on the same machine, so simply point your web browser at it and see if you can get there. Install whatever additional packages or libraries the project's INSTALL document advises. (Or not: if your buildslave is supposed to make sure that building without optional libraries still works, then don't install those libraries.)
    3. Test the build process: Follow the instructions in the INSTALL document, in the buildbot account. Perform a full CVS (or whatever) checkout, configure, make, run tests, etc. Confirm that the build works without manual fussing. If it doesn't work when you do it by hand, it will be unlikely to work when the buildbot attempts to do it in an automated fashion.
    4. Choose a base directory: This should be somewhere in the buildbot account, typically named after the project which is being tested. The buildslave will not touch any file outside of this directory.
    5. Create the hostinfo files: When it first connects, the buildslave will send a few files up to the buildmaster which describe the host that it is running on. These files are presented on the web status display so that developers have more information to reproduce any test failures that are witnessed by the buildbot. Create a directory named 'info' (directly under the buildbot home directory would be a good place). Inside it, make a file named 'admin', and put your name/email in it. This is the buildslave admin address, and will be reachable from the build status page (so you may wish to munge it a bit if address-harvesting spambots are a concern). Create a second file named 'host' and fill it with a brief description of the host: OS, version, memory size, CPU speed, versions of relevant libraries installed, and finally the version of the buildbot code which is running the buildslave. You will create a symlink to this directory when you finally set up the buildslave.
    6. Get the buildmaster host/port, botname, and password: The buildmaster admin will give you a hostname:portno pair which specifies the TCP port on which the buildmaster is expecting connections from the buildslaves. They will also assign you a name and password which your bot will use.
    7. Install the buildbot code: Obtain the latest tarball from buildbot.sf.net, verify the signature, unpack it, then do the usual python ./setup.py build; sudo python ./setup.py install dance. If you do not have root on this host, you can install it into a different directory as long as you remember to add it to the PYTHONPATH environment variable at the right time. python ./setup.py install --home=~ and then PYTHONPATH=~/lib/python is a common technique.

    Configuring the buildslave

    With all that setup ready, you are ready to create the buildslave. There is a tool provided in the buildbot package named (simply enough) buildbot, which usually gets installed to /usr/bin/buildbot. This tool provides a front-end to a Twisted program named mktap. When you use buildbot to create the buildslave, it will create the base directory for you (and complain if it already exists). Take the BASEDIR you've picked, the HOST:PORT buildmaster location, and the BOTNAME and PASSWORD you've been assigned, and run buildbot as follows:

    buildbot slave BASEDIR HOST:PORT BOTNAME PASSWORD
    

    That will create and populate BASEDIR with some setup files. The buildbot.tap file contains a freeze-dried buildslave object, ready to be run by Twisted's daemon-launching utility twistd (pronounced twist-dee). That's it. In the future, buildbot slave will probably do more setup.

    Now symlink your hostinfo directory into place:

    cd BASEDIR
    ln -s ~/info ./info
    

    Your buildslave is now ready to run!

    Starting the buildslave

    To start the buildslave manually, just use the buildbot tool again:

    buildbot start BASEDIR
    

    This will start any freeze-dried application found in the given directory. (The same command is used to start a buildmaster instance). Note that buildbot start is really just a front end for twistd.

    As soon as the buildslave starts, you should find two new files in its base directory. The first is named twistd.pid, and simply contains the process ID of the buildslave's twistd process. You can use ps to find it in your process table. The second is twistd.log, and is the buildslave's log file. Everything the buildslave does is recorded in this file. It is the first place to look for error messages or exception traces.

    Once the buildslave connects to the buildmaster, new directories will start appearing in the base directory. The buildmaster tells the slave to create a directory for each builder which will be using that slave. Within these directories, CVS checkouts, compiles, and tests are performed.

    Making sure the buildslave starts at each reboot

    Before you are done, you need to make sure the buildslave will keep running even if the host reboots. The easiest way I've found to do this is to add a @reboot crontab entry. Most modern versions of cron interpret a time specification of @reboot to indicate that the given job should be run the first time cron is started after system boot. Something like the following usually works:

    @reboot buildbot start BASEDIR
    

    It is important to remember that the environment provided to cron jobs can be quite different from your normal runtime environment. There may be fewer environment variables specified, and the PATH may be shorter than usual. It is a good idea to test out this method of launching the buildslave by using a time in the near future, with the same command, and then check twistd.log to make sure the slave actually started correctly.

    Shutting down the buildslave

    To stop the buildslave manually, use the buildbot tool again:

    buildbot stop BASEDIR
    

    This simply looks for the twistd.pid file and kills whatever process is identified within.

    At system shutdown, all processes are sent a SIGTERM. The buildslave will respond to this by shutting down normally.

    Troubleshooting

    Cron jobs are typically run with a minimal shell (/bin/sh, not /bin/bash), and tilde expansion is not always performed in such commands. You may want to use explicit paths, because the PATH is usually quite short and doesn't include anything set by your shell's startup scripts (.profile, .bashrc, etc). If you've installed buildbot (or other python libraries) to an unusual location, you may need to add a PYTHONPATH specification (note that python will do tilde-expansion on PYTHONPATH elements by itself).

    @reboot PYTHONPATH=~/lib/python /usr/local/bin/buildbot start /usr/home/buildbot/basedir
    

    Take the time to get the @reboot job set up. Otherwise, things will work fine for a while, but the first power outage or system reboot you have will stop the buildslave with nothing but the cries of sorrowful developers to remind you that it has gone away.

    From the buildmaster's main status web page, you can force a build to be run on your build slave. Figure out which column is for a builder that runs on your slave, click on that builder's name, and the page that comes up will have a Force Build button. Fill in the form, hit the button, and a moment later you should see your slave's twistd.log filling with commands being run. Using pstree or top should also reveal the cvs/make/gcc/etc processes being run by the buildslave. Note that the same web page should also show the admin and host information files that you configured earlier.

    From warner at users.sourceforge.net Mon Dec 6 03:09:27 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 03:09:27 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.326,1.327 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22115 Modified Files: ChangeLog Log Message: provide a buildslave setup checklist Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.326 retrieving revision 1.327 diff -u -d -r1.326 -r1.327 --- ChangeLog 6 Dec 2004 01:23:13 -0000 1.326 +++ ChangeLog 6 Dec 2004 03:09:25 -0000 1.327 @@ -1,5 +1,7 @@ 2004-12-05 Brian Warner + * docs/slave.xhtml: provide a buildslave setup checklist + * docs/source.xhtml (Arch): correct terminology 2004-12-04 Brian Warner From warner at users.sourceforge.net Mon Dec 6 03:31:49 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 03:31:49 +0000 Subject: [Buildbot-commits] buildbot/docs slave.xhtml,1.1,1.2 Message-ID: Update of /cvsroot/buildbot/buildbot/docs In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26287 Modified Files: slave.xhtml Log Message: more notes Index: slave.xhtml =================================================================== RCS file: /cvsroot/buildbot/buildbot/docs/slave.xhtml,v retrieving revision 1.1 retrieving revision 1.2 diff -u -d -r1.1 -r1.2 --- slave.xhtml 6 Dec 2004 03:09:24 -0000 1.1 +++ slave.xhtml 6 Dec 2004 03:31:46 -0000 1.2 @@ -186,7 +186,21 @@ respond to this by shutting down normally.

    -

    Troubleshooting

    +

    Maintenance

    + +

    It is a good idea to check the buildmaster's status page every once in a +while, to see if your buildslave is still online. Eventually the buildbot +will probably be enhanced to send you email (via the info/admin email +address) when the slave has been offline for more than a few hours.

    + +

    If you find you can no longer provide a buildslave to the project, please +let the project admins know, so they can put out a call for a +replacement.

    + + +

    Troubleshooting

    + +

    Starting the buildslave

    Cron jobs are typically run with a minimal shell (/bin/sh, not /bin/bash), and tilde expansion is not always performed in such commands. You may want to @@ -205,6 +219,26 @@ stop the buildslave with nothing but the cries of sorrowful developers to remind you that it has gone away.

    + +

    Connecting to the buildmaster

    + +

    If the buildslave cannot connect to the buildmaster, the reason should be +described in the twistd.log logfile. Some common problems are an +incorrect master hostname or port number, or a mistyped bot name or password. +If the buildslave loses the connection to the master, it is supposed to +attempt to reconnect with an exponentially-increasing backoff. Each attempt +(and the time of the next attempt) will be logged. If you get impatient, just +manually stop and re-start the buildslave.

    + +

    When the buildmaster is restarted, all slaves will be disconnected, and +will attempt to reconnect as usual. The reconnect time will depend upon how +long the buildmaster is offline (i.e. how far up the exponential backoff +curve the slaves have travelled). Again, buildbot stop BASEDIR; +buildbot start BASEDIR will speed up the process.

    + + +

    Running builds

    +

    From the buildmaster's main status web page, you can force a build to be run on your build slave. Figure out which column is for a builder that runs on your slave, click on that builder's name, and the page that comes up will @@ -215,5 +249,4 @@ the same web page should also show the admin and host information files that you configured earlier.

    - From warner at users.sourceforge.net Mon Dec 6 07:00:44 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:00:44 +0000 Subject: [Buildbot-commits] site robots.txt,NONE,1.1 source-Arch.html,NONE,1.1 index.html,1.31,1.32 Message-ID: Update of /cvsroot/buildbot/site In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32309 Modified Files: index.html Added Files: robots.txt source-Arch.html Log Message: added Arch repository pointer, robots.txt Index: index.html =================================================================== RCS file: /cvsroot/buildbot/site/index.html,v retrieving revision 1.31 retrieving revision 1.32 diff -u -d -r1.31 -r1.32 --- index.html 25 Nov 2004 00:05:00 -0000 1.31 +++ index.html 6 Dec 2004 07:00:41 -0000 1.32 @@ -16,7 +16,13 @@ href="http://sourceforge.net/project/showfiles.php?group_id=73177">here. The release is signed with my GPG public key, available here. - + +
  1. The latest code is available from CVS for browsing + or read-only + checkout. There is also an Arch repository which tracks the main CVS + tree, details are here.
  2. +
  3. The README file: installation hints, overview
  4. Recent changes are summarized in the NEWS file, @@ -108,5 +114,5 @@ align="right"> -Last modified: Wed Nov 24 16:04:37 PST 2004 +Last modified: Sun Dec 5 22:29:48 PST 2004 --- NEW FILE: robots.txt --- # robots.txt for http://buildbot.sf.net/ User-agent: * Crawl-Delay: 30 Disallow: /Arch --- NEW FILE: source-Arch.html --- Buildbot Arch archive

    Buildbot source code via Arch

    In addition to CVS, the Buildbot source code is available via Arch. The archive is experimental, but I figure this may make it slightly easier for contributors to track the upstream sources. More significantly, the Arch archive is synchronized from the CVS repository within a few minutes after each commit, as opposed to the 5-plus-hour delay between a commit and the time the sf.net anonymous CVS repository gets updated.

    The result is that if you wanted to (say) set up a metabuildbot (a Buildbot which runs the Buildbot unit test suite), and you wanted it to be able to test code that was a few minutes old instead of several hours old, you'd want to configure it to get its source code from the Arch archive instead of from anoncvs.

    The Arch coordinates are as follows:

    • Name: arch at buildbot.sf.net--2004
    • Location: http://buildbot.sourceforge.net/Arch/2004/
    • Version: buildbot--dev--0

    Therefore, the following commands will get you an up-to-date Buildbot tree:

    tla register-archive http://buildbot.sourceforge.net/Arch/2004/
    tla get arch at buildbot.sf.net--2004/buildbot--dev--0
    

    All changes are signed with GPG key 0x12FF2BDA, provided below. To verify the signatures on each revision as you access them, import the key, and put something like the following in your ~/.arch-params/signing/arch at buildbot.sf.net--2004.check, or see the Arch wiki for other possibilities:

    tla-gpg-check gpg_command="LC_MESSAGES=C gpg --verify-files -q --no-show-notation --batch --no-tty -" 2>&1 | grep "^gpg: Good signature from" 1>&2
    
    pub  1024D/12FF2BDA 2004-12-05 Arch Signing Key (automated) <arch at buildbot.sf.net>
    sub  1024g/9D779314 2004-12-05 [expires: 2009-12-04]
    
    -----BEGIN PGP PUBLIC KEY BLOCK-----
    Version: GnuPG v1.2.5 (GNU/Linux)
    
    mQGiBEGzfJERBAChqsnvXWFkx0PdfdD8pfuriKy0QxDaRW63KW+BC9P/AVkVW5h+
    j4KKoBlF45V+nWEyu+PkNlDYscn94kodvytA1Ah1x1ZVR2SeqLEoQSgZV9WIfhBm
    2wxHkCpSnfhxgr/a/YYeSBHIqqyl6ONU6CuXcrMNt1qnHe2k5NgfRob/UwCgghKe
    gRbhCXOCirBb8EG10iohRjcEAJWl5ZWYHzIuqYv9FnLj/68onZhWZw3Rfldi7EMY
    GJgNanhTs4o3I7S9qL5UXWl1VD1ypkPrLQwLz1sY7DqyEmEyJZeYDfUoHSXAq6in
    6o/TgNkOo9BJNThpY2sEhXBsP7pEIb3m8b+JNt0tFaEdBFbYBxgWJqCu2Uyyl60X
    ZLDiA/0VklvxyMtQSb5JHUD3JZ20m9/ejUUEizGpm7FbZD/+oTqI1WzzpJNzaUy7
    74FHbhzx7Fgm3gZhCZRAvTLPnJuiiAZH4CwO0oHPOBd3PbpnQz34mqcVwn/ogXaT
    EwT9mtSB7pcRyoOtopaOt9NKcGcFzi1gMvnHQsC4kUWi0dfnqLQzQXJjaCBTaWdu
    aW5nIEtleSAoYXV0b21hdGVkKSA8YXJjaEBidWlsZGJvdC5zZi5uZXQ+iGQEExEC
    ACQFAkGzfJECGwMFCQlmAYAGCwkIBwMCAxUCAwMWAgECHgECF4AACgkQYoUuNBL/
    K9oCeACeN5CEGXWi67yLRVA6GmA8ofHhlx8AnjQUXM82HHuPScO2/Ta4GWlyNq/Z
    iEwEExECAAwFAkGz/8gFgwllfkkACgkQtGNUshUUp73g2ACeJCYRwYrpn6Fx5/Q/
    Pvm/5BaN6xAAoIYLv4PwaBHuaHi5q32b4RFoDq8RuQENBEGzfJIQBADCxhNV9ZI+
    d6EkITC3CFdFN7Q7s5YKCxYEhhgnWb4LrBiYG4iCpZptgakHzc4cf+xDGZbz9foN
    lnMfS1UDgtCB3DFYSLS0Z1h2GfWPaUQH/EUqlZ2KcSCAq+loU+2PN4E6Kh05AuoD
    yIrUtzwadV2upm+5BAf8j3ErGwrXeM6IwwADBQP+OKXD9pRidJgFuEMbioljHl8S
    Dxl1L7Q6OFtCMl7/AjSx1kKHbn44n1jSIUQYZ9lbUibdK4BHfZI6EHm7lU6raaWR
    JDbuGSTVTwakjPCHbPA3D2EYYGSIOODs2Y2/YrUHagWjDEJGfpczRquhY8Ysspfp
    dV0r5DUEUnyVzrvJmyOITwQYEQIADwUCQbN8kgIbDAUJCWYBgAAKCRBihS40Ev8r
    2tl9AJ4wYrucqksA+DYIF6d+oNT+3WhF0gCfY2YXan8ut+X58k/pfYc5zVAh+4s=
    =jDx5
    -----END PGP PUBLIC KEY BLOCK-----
    

    Please note that this is an automated signing key, with slightly less security than my master key (0x1514A7BD) which is used to sign buildbot releases. It is signed by my master key.

    Also note that this is a two-way sync, and that when I sometimes do Buildbot work from a local Arch repository, those changes can get pushed into CVS via the same gateway. Still experimental, but it means that if you use Arch and want to contribute, I'll be able to pull from your archive instead of having you mail me patches all the time.


    Brian Warner <warner@lothar.com>
    Last modified: Sun Dec 5 22:58:54 PST 2004 From warner at users.sourceforge.net Mon Dec 6 07:04:49 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:04:49 +0000 Subject: [Buildbot-commits] site source-Arch.html,1.1,1.2 Message-ID: Update of /cvsroot/buildbot/site In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv527 Modified Files: source-Arch.html Log Message: minor edits Index: source-Arch.html =================================================================== RCS file: /cvsroot/buildbot/site/source-Arch.html,v retrieving revision 1.1 retrieving revision 1.2 diff -u -d -r1.1 -r1.2 --- source-Arch.html 6 Dec 2004 07:00:41 -0000 1.1 +++ source-Arch.html 6 Dec 2004 07:04:47 -0000 1.2 @@ -38,17 +38,25 @@ tla get arch at buildbot.sf.net--2004/buildbot--dev--0 -

    All changes are signed with GPG key 0x12FF2BDA, provided below. To verify -the signatures on each revision as you access them, import the key, and put -something like the following in your -~/.arch-params/signing/arch at buildbot.sf.net--2004.check, or see -the Arch -wiki for other possibilities:

    +

    All changes are signed with a dedicated GPG key 0x12FF2BDA, +provided below. To verify the signatures on each revision as you access them, +import the key, and put something like the following in your +~/.arch-params/signing/arch at buildbot.sf.net--2004.check (this +uses the tla-gpg-check utility from the debian tla +package), or see the Arch wiki for +other possibilities:

     tla-gpg-check gpg_command="LC_MESSAGES=C gpg --verify-files -q --no-show-notation --batch --no-tty -" 2>&1 | grep "^gpg: Good signature from" 1>&2
     
    + +

    Please note that this is an automated signing key, with slightly less +security than my master +key (0x1514A7BD) which is used to sign buildbot releases. It is signed by +my master key.

    +
     pub  1024D/12FF2BDA 2004-12-05 Arch Signing Key (automated) <arch at buildbot.sf.net>
     sub  1024g/9D779314 2004-12-05 [expires: 2009-12-04]
    @@ -81,11 +89,6 @@
     -----END PGP PUBLIC KEY BLOCK-----
     
    -

    Please note that this is an automated signing key, with slightly less -security than my master -key (0x1514A7BD) which is used to sign buildbot releases. It is signed by -my master key.

    -

    Also note that this is a two-way sync, and that when I sometimes do Buildbot work from a local Arch repository, those changes can get pushed into CVS via the same gateway. Still experimental, but it means that if you use @@ -98,5 +101,5 @@ Brian Warner <warner@lothar.com> -Last modified: Sun Dec 5 22:58:54 PST 2004 +Last modified: Sun Dec 5 23:04:38 PST 2004 From warner at users.sourceforge.net Mon Dec 6 07:07:53 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:07:53 +0000 Subject: [Buildbot-commits] site index.html,1.32,1.33 source-Arch.html,1.2,1.3 Message-ID: Update of /cvsroot/buildbot/site In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv982 Modified Files: index.html source-Arch.html Log Message: more fixes Index: index.html =================================================================== RCS file: /cvsroot/buildbot/site/index.html,v retrieving revision 1.32 retrieving revision 1.33 diff -u -d -r1.32 -r1.33 --- index.html 6 Dec 2004 07:00:41 -0000 1.32 +++ index.html 6 Dec 2004 07:07:51 -0000 1.33 @@ -105,14 +105,14 @@ src="http://www.lothar.com/castle-icon.gif" /> Brian Warner - <warner @ lothar.com> + <warner @ lothar.com> - - +SourceForge.net Logo - + align="right" /> + -Last modified: Sun Dec 5 22:29:48 PST 2004 +Last modified: Sun Dec 5 23:07:10 PST 2004 Index: source-Arch.html =================================================================== RCS file: /cvsroot/buildbot/site/source-Arch.html,v retrieving revision 1.2 retrieving revision 1.3 diff -u -d -r1.2 -r1.3 --- source-Arch.html 6 Dec 2004 07:04:47 -0000 1.2 +++ source-Arch.html 6 Dec 2004 07:07:51 -0000 1.3 @@ -96,10 +96,16 @@ of having you mail me patches all the time.

    -
    +
    + +
    - - - Brian Warner <warner@lothar.com>
    -Last modified: Sun Dec 5 23:04:38 PST 2004 + + + + Brian Warner + <warner @ lothar.com> + +Last modified: Sun Dec 5 23:07:27 PST 2004 From warner at users.sourceforge.net Mon Dec 6 07:21:53 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:21:53 +0000 Subject: [Buildbot-commits] buildbot/buildbot/status html.py,1.47,1.48 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/status In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3248/buildbot/status Modified Files: html.py Log Message: (WaterfallStatusResource.phase2): Add the date to the top-most box, if it is not the same as today's date. Index: html.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/status/html.py,v retrieving revision 1.47 retrieving revision 1.48 diff -u -d -r1.47 -r1.48 --- html.py 3 Dec 2004 22:54:52 -0000 1.47 +++ html.py 6 Dec 2004 07:21:51 -0000 1.48 @@ -1119,9 +1119,8 @@ # grid is a list of columns, one for the timestamps, and one per # event source. Each column is exactly the same height. Each element # of the list is a single box. - if timestamps: - lastDate = time.strftime("%d %b %Y", - time.localtime(timestamps[0])) + lastDate = time.strftime("%d %b %Y", + time.localtime(util.now())) for r in range(0, len(timestamps)): chunkstrip = eventGrid[r] # chunkstrip is a horizontal strip of event blocks. 
Each block @@ -1135,7 +1134,8 @@ else: # timestamp goes at the bottom of the chunk stuff = [] - # add the date each time it changes + # add the date at the beginning (if it is not the same as + # today's date), and each time it changes todayday = time.strftime("%a", time.localtime(timestamps[r])) today = time.strftime("%d %b %Y", From warner at users.sourceforge.net Mon Dec 6 07:21:54 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:21:54 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.327,1.328 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3248 Modified Files: ChangeLog Log Message: (WaterfallStatusResource.phase2): Add the date to the top-most box, if it is not the same as today's date. Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.327 retrieving revision 1.328 diff -u -d -r1.327 -r1.328 --- ChangeLog 6 Dec 2004 03:09:25 -0000 1.327 +++ ChangeLog 6 Dec 2004 07:21:51 -0000 1.328 @@ -1,5 +1,9 @@ 2004-12-05 Brian Warner + * buildbot/status/html.py (WaterfallStatusResource.phase2): Add + the date to the top-most box, if it is not the same as today's + date. 
+ * docs/slave.xhtml: provide a buildslave setup checklist * docs/source.xhtml (Arch): correct terminology From warner at users.sourceforge.net Mon Dec 6 07:36:35 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:36:35 +0000 Subject: [Buildbot-commits] buildbot/buildbot/process builder.py,1.19,1.20 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/process In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5706/buildbot/process Modified Files: builder.py Log Message: * buildbot/master.py (DebugPerspective.attached): return 'self', to match the maybeDeferred change in Dispatcher.requestAvatar * buildbot/changes/pb.py (ChangePerspective.attached): same * buildbot/status/client.py (StatusClientPerspective.attached): same * buildbot/process/builder.py (Builder._attached3): same * buildbot/pbutil.py (NewCredPerspective.attached): same Index: builder.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/process/builder.py,v retrieving revision 1.19 retrieving revision 1.20 diff -u -d -r1.19 -r1.20 --- builder.py 3 Dec 2004 22:54:51 -0000 1.19 +++ builder.py 6 Dec 2004 07:36:33 -0000 1.20 @@ -163,6 +163,7 @@ for w in self.watchers['attach']: w.callback(self) self.watchers['attach'] = [] + return self def getSlaveCommandVersion(self, command, oldversion=None): if self.remoteCommands is None: From warner at users.sourceforge.net Mon Dec 6 07:36:35 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:36:35 +0000 Subject: [Buildbot-commits] buildbot/buildbot/status client.py,1.13,1.14 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/status In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5706/buildbot/status Modified Files: client.py Log Message: * buildbot/master.py (DebugPerspective.attached): return 'self', to match the maybeDeferred change in Dispatcher.requestAvatar * buildbot/changes/pb.py (ChangePerspective.attached): 
same * buildbot/status/client.py (StatusClientPerspective.attached): same * buildbot/process/builder.py (Builder._attached3): same * buildbot/pbutil.py (NewCredPerspective.attached): same Index: client.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/status/client.py,v retrieving revision 1.13 retrieving revision 1.14 diff -u -d -r1.13 -r1.14 --- client.py 23 Sep 2004 19:31:32 -0000 1.13 +++ client.py 6 Dec 2004 07:36:33 -0000 1.14 @@ -227,7 +227,7 @@ def attached(self, mind): #log.msg("StatusClientPerspective.attached") - pass + return self def detached(self, mind): log.msg("PB client detached") From warner at users.sourceforge.net Mon Dec 6 07:36:37 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:36:37 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.328,1.329 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5706 Modified Files: ChangeLog Log Message: * buildbot/master.py (DebugPerspective.attached): return 'self', to match the maybeDeferred change in Dispatcher.requestAvatar * buildbot/changes/pb.py (ChangePerspective.attached): same * buildbot/status/client.py (StatusClientPerspective.attached): same * buildbot/process/builder.py (Builder._attached3): same * buildbot/pbutil.py (NewCredPerspective.attached): same Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.328 retrieving revision 1.329 diff -u -d -r1.328 -r1.329 --- ChangeLog 6 Dec 2004 07:21:51 -0000 1.328 +++ ChangeLog 6 Dec 2004 07:36:34 -0000 1.329 @@ -1,5 +1,12 @@ 2004-12-05 Brian Warner + * buildbot/master.py (DebugPerspective.attached): return 'self', to + match the maybeDeferred change in Dispatcher.requestAvatar + * buildbot/changes/pb.py (ChangePerspective.attached): same + * buildbot/status/client.py 
(StatusClientPerspective.attached): same + * buildbot/process/builder.py (Builder._attached3): same + * buildbot/pbutil.py (NewCredPerspective.attached): same + * buildbot/status/html.py (WaterfallStatusResource.phase2): Add the date to the top-most box, if it is not the same as today's date. From warner at users.sourceforge.net Mon Dec 6 07:36:36 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:36:36 +0000 Subject: [Buildbot-commits] buildbot/buildbot/changes pb.py,1.4,1.5 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/changes In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5706/buildbot/changes Modified Files: pb.py Log Message: * buildbot/master.py (DebugPerspective.attached): return 'self', to match the maybeDeferred change in Dispatcher.requestAvatar * buildbot/changes/pb.py (ChangePerspective.attached): same * buildbot/status/client.py (StatusClientPerspective.attached): same * buildbot/process/builder.py (Builder._attached3): same * buildbot/pbutil.py (NewCredPerspective.attached): same Index: pb.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/changes/pb.py,v retrieving revision 1.4 retrieving revision 1.5 diff -u -d -r1.4 -r1.5 --- pb.py 8 Nov 2004 19:38:09 -0000 1.4 +++ pb.py 6 Dec 2004 07:36:33 -0000 1.5 @@ -19,7 +19,7 @@ self.sep = sep def attached(self, mind): - pass + return self def detached(self, mind): pass def perspective_addChange(self, changedict): From warner at users.sourceforge.net Mon Dec 6 07:36:36 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 07:36:36 +0000 Subject: [Buildbot-commits] buildbot/buildbot pbutil.py,1.7,1.8 master.py,1.56,1.57 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5706/buildbot Modified Files: pbutil.py master.py Log Message: * buildbot/master.py (DebugPerspective.attached): return 'self', to match the 
maybeDeferred change in Dispatcher.requestAvatar * buildbot/changes/pb.py (ChangePerspective.attached): same * buildbot/status/client.py (StatusClientPerspective.attached): same * buildbot/process/builder.py (Builder._attached3): same * buildbot/pbutil.py (NewCredPerspective.attached): same Index: master.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/master.py,v retrieving revision 1.56 retrieving revision 1.57 diff -u -d -r1.56 -r1.57 --- master.py 4 Dec 2004 21:12:19 -0000 1.56 +++ master.py 6 Dec 2004 07:36:34 -0000 1.57 @@ -436,7 +436,7 @@ class DebugPerspective(NewCredPerspective): def attached(self, mind): - pass + return self def detached(self, mind): pass Index: pbutil.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/pbutil.py,v retrieving revision 1.7 retrieving revision 1.8 diff -u -d -r1.7 -r1.8 --- pbutil.py 5 Dec 2003 20:38:51 -0000 1.7 +++ pbutil.py 6 Dec 2004 07:36:34 -0000 1.8 @@ -10,7 +10,7 @@ class NewCredPerspective(pb.Avatar): def attached(self, mind): - pass + return self def detached(self, mind): pass From warner at users.sourceforge.net Mon Dec 6 08:09:32 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 08:09:32 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.329,1.330 NEWS,1.33,1.34 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv11588 Modified Files: ChangeLog NEWS Log Message: Revision: arch at buildbot.sf.net--2004/buildbot--dev--0--patch-10 Creator: Brian Warner NEWS: update for stuff since last release 2004-12-05 Brian Warner * NEWS: update for stuff since last release Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.329 retrieving revision 1.330 diff -u -d -r1.329 -r1.330 --- ChangeLog 6 Dec 2004 
07:36:34 -0000 1.329 +++ ChangeLog 6 Dec 2004 08:09:29 -0000 1.330 @@ -1,5 +1,7 @@ 2004-12-05 Brian Warner + * NEWS: update for stuff since last release + * buildbot/master.py (DebugPerspective.attached): return 'self', to match the maybeDeferred change in Dispatcher.requestAvatar * buildbot/changes/pb.py (ChangePerspective.attached): same Index: NEWS =================================================================== RCS file: /cvsroot/buildbot/buildbot/NEWS,v retrieving revision 1.33 retrieving revision 1.34 diff -u -d -r1.33 -r1.34 --- NEWS 23 Nov 2004 11:18:40 -0000 1.33 +++ NEWS 6 Dec 2004 08:09:30 -0000 1.34 @@ -1,5 +1,53 @@ User visible changes in Buildbot. +* Release ? (?) + +** new features + +It is now possible to interrupt a running build. Both the web page and the +IRC bot feature 'stop build' commands, which can be used to interrupt the +current BuildStep and accelerate the termination of the overall Build. The +status reporting for these still leaves something to be desired (an +'interrupt' event is pushed into the column, and the reason for the interrupt +is added to a pseudo-logfile for the step that was stopped, but if you only +look at the top-level status it appears that the build failed on its own). + +Builds are also halted if the connection to the buildmaster is lost. + +** minor new features + +The IRC log bot now reports ETA times in a MMSS format like "2m45s" instead +of the clunky "165 seconds". + +** bug fixes + +*** Slave Disconnect + +Slave disconnects should be handled better now: the current build should be +abandoned properly. Earlier versions could get into weird states where the +build failed to finish, clogging the builder forever (or at least until the +buildmaster was restarted). + +In addition, there are weird network conditions which could cause a +buildslave to attempt to connect twice to the same buildmaster. This can +happen when the slave is sending large logfiles over a slow link, while using +short keepalive timeouts. 
The buildmaster has been fixed to allow the second +connection attempt to take precedence over the first, so that the older +connection is jettisoned to make way for the newer one. This is half of the +fix, the other will involve fixing the slave to avoid the double-connect +situation in the first place. + +*** Large Logfiles + +The web page rendering code has been fixed to deliver large logfiles in +pieces, using a producer/consumer apparatus. This avoids the large spike in +memory consumption when the log file body was linearized into a single string +and then buffered in the socket's application-side transmit buffer. This +should also avoid the 640k single-string limit for web.distrib servers that +could be hit by large (>640k) logfiles. + + + * Release 0.6.1 (23 Nov 2004) ** win32 improvements/bugfixes From warner at users.sourceforge.net Mon Dec 6 08:47:43 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 08:47:43 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.330,1.331 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv19077 Modified Files: ChangeLog Log Message: update for previous (0.6.1) release. Obviously this needs to be handled better. Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.330 retrieving revision 1.331 diff -u -d -r1.330 -r1.331 --- ChangeLog 6 Dec 2004 08:09:29 -0000 1.330 +++ ChangeLog 6 Dec 2004 08:47:41 -0000 1.331 @@ -1,3 +1,8 @@ +2004-12-06 Brian Warner + + * debian/changelog: update for previous (0.6.1) release. Obviously + this needs to be handled better. 
+ 2004-12-05 Brian Warner * NEWS: update for stuff since last release From warner at users.sourceforge.net Mon Dec 6 08:47:43 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 08:47:43 +0000 Subject: [Buildbot-commits] buildbot/debian changelog,1.2,1.3 Message-ID: Update of /cvsroot/buildbot/buildbot/debian In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv19077/debian Modified Files: changelog Log Message: update for previous (0.6.1) release. Obviously this needs to be handled better. Index: changelog =================================================================== RCS file: /cvsroot/buildbot/buildbot/debian/changelog,v retrieving revision 1.2 retrieving revision 1.3 diff -u -d -r1.2 -r1.3 --- changelog 1 Oct 2004 02:39:20 -0000 1.2 +++ changelog 6 Dec 2004 08:47:40 -0000 1.3 @@ -1,3 +1,9 @@ +buildbot (0.6.1-1) unstable; urgency=low + + * New upstream release + + -- Brian Warner Mon, 6 Dec 2004 00:46:39 -0800 + buildbot (0.6.0-1) unstable; urgency=low * New upstream release From warner at users.sourceforge.net Mon Dec 6 09:00:34 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 09:00:34 +0000 Subject: [Buildbot-commits] buildbot/buildbot/slave commands.py,1.18,1.19 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/slave In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21398/buildbot/slave Modified Files: commands.py Log Message: Revision: arch at buildbot.sf.net--2004/buildbot--dev--0--patch-14 Creator: Brian Warner Arch.doVCUpdate: use 'tla replay' instead of 'tla update' 2004-12-06 Brian Warner * buildbot/slave/commands.py (Arch.doVCUpdate): use 'tla replay' instead of 'tla update', which is more efficient in case we've missed a couple of patches since the last update. 
Index: commands.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/commands.py,v retrieving revision 1.18 retrieving revision 1.19 diff -u -d -r1.18 -r1.19 --- commands.py 4 Dec 2004 21:02:03 -0000 1.18 +++ commands.py 6 Dec 2004 09:00:32 -0000 1.19 @@ -827,7 +827,7 @@ def doVCUpdate(self): # update: possible for mode in ('copy', 'update') d = os.path.join(self.builder.basedir, self.srcdir) - command = ['tla', 'update'] + command = ['tla', 'replay'] c = ShellCommand(self.builder, command, d, sendRC=False, timeout=self.timeout) self.command = c From warner at users.sourceforge.net Mon Dec 6 09:00:34 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 06 Dec 2004 09:00:34 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.331,1.332 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv21398 Modified Files: ChangeLog Log Message: Revision: arch at buildbot.sf.net--2004/buildbot--dev--0--patch-14 Creator: Brian Warner Arch.doVCUpdate: use 'tla replay' instead of 'tla update' 2004-12-06 Brian Warner * buildbot/slave/commands.py (Arch.doVCUpdate): use 'tla replay' instead of 'tla update', which is more efficient in case we've missed a couple of patches since the last update. Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.331 retrieving revision 1.332 diff -u -d -r1.331 -r1.332 --- ChangeLog 6 Dec 2004 08:47:41 -0000 1.331 +++ ChangeLog 6 Dec 2004 09:00:30 -0000 1.332 @@ -1,5 +1,9 @@ 2004-12-06 Brian Warner + * buildbot/slave/commands.py (Arch.doVCUpdate): use 'tla replay' + instead of 'tla update', which is more efficient in case we've + missed a couple of patches since the last update. + * debian/changelog: update for previous (0.6.1) release. Obviously this needs to be handled better. 
From warner at users.sourceforge.net Wed Dec 8 03:54:58 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Wed, 08 Dec 2004 03:54:58 +0000 Subject: [Buildbot-commits] buildbot/buildbot/process step.py,1.57,1.58 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/process In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv1514/buildbot/process Modified Files: step.py Log Message: * buildbot/test/test_run.py (Disconnect.testBuild4): validate that losing the slave in the middle of a remote step is handled too * buildbot/process/step.py (ShellCommand.interrupt): 'reason' can be a Failure, so be sure to stringify it before using it as the contents of the 'interrupt' logfile (RemoteCommand.interrupt): use stringified 'why' in remote_interruptCommand too, just in case Index: step.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/process/step.py,v retrieving revision 1.57 retrieving revision 1.58 diff -u -d -r1.57 -r1.58 --- step.py 3 Dec 2004 22:54:51 -0000 1.57 +++ step.py 8 Dec 2004 03:54:56 -0000 1.58 @@ -90,8 +90,8 @@ # tell the remote command to halt. Returns a Deferred that will fire # when the interrupt command has been delivered. - d = defer.maybeDeferred(self.remote.callRemote, - "interruptCommand", self.commandID, why) + d = defer.maybeDeferred(self.remote.callRemote, "interruptCommand", + self.commandID, str(why)) return d def remote_update(self, updates): @@ -577,8 +577,9 @@ def interrupt(self, reason): # TODO: consider adding an INTERRUPTED or STOPPED status to use - # instead of FAILURE, might make the text a bit more clear - self.addCompleteLog('interrupt', reason) + # instead of FAILURE, might make the text a bit more clear. 
+ # 'reason' can be a Failure, or text + self.addCompleteLog('interrupt', str(reason)) d = self.cmd.interrupt(reason) return d @@ -1184,6 +1185,7 @@ @param results: None """ + haltOnFailure = True name = "remote dummy" def __init__(self, timeout=5, **kwargs): From warner at users.sourceforge.net Wed Dec 8 03:54:58 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Wed, 08 Dec 2004 03:54:58 +0000 Subject: [Buildbot-commits] buildbot/buildbot/test test_run.py,1.19,1.20 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/test In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv1514/buildbot/test Modified Files: test_run.py Log Message: * buildbot/test/test_run.py (Disconnect.testBuild4): validate that losing the slave in the middle of a remote step is handled too * buildbot/process/step.py (ShellCommand.interrupt): 'reason' can be a Failure, so be sure to stringify it before using it as the contents of the 'interrupt' logfile (RemoteCommand.interrupt): use stringified 'why' in remote_interruptCommand too, just in case Index: test_run.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/test/test_run.py,v retrieving revision 1.19 retrieving revision 1.20 diff -u -d -r1.19 -r1.20 --- test_run.py 3 Dec 2004 22:54:53 -0000 1.19 +++ test_run.py 8 Dec 2004 03:54:56 -0000 1.20 @@ -374,6 +374,16 @@ self.failUnlessEqual(bs.getResults(), builder.FAILURE) + def verifyDisconnect2(self, bs): + self.failUnless(bs.isFinished()) + + step1 = bs.getSteps()[1] + self.failUnlessEqual(step1.getText(), ["remote", "delay", "2 secs", + "failed", "slave", "lost"]) + self.failUnlessEqual(step1.getResults()[0], builder.FAILURE) + + self.failUnlessEqual(bs.getResults(), builder.FAILURE) + def testIdle1(self): m,s,c,s1 = self.disconnectSetup() @@ -450,6 +460,19 @@ self.failUnlessEqual(s1.getState()[0], "offline") self.verifyDisconnect(bs) + def testBuild4(self): + m,s,c,s1 = self.disconnectSetup() + # this next 
sequence is timing-dependent + bc = c.getBuilder("dummy").forceBuild(None, "forced build") + bs = bc.getStatus() + # kill the slave while it's running the second (remote) step + reactor.callLater(1.5, self.killSlave) + + dr(bs.waitUntilFinished(), 5) + + self.failUnlessEqual(s1.getState()[0], "offline") + self.verifyDisconnect2(bs) + def testInterrupt(self): m,s,c,s1 = self.disconnectSetup() # this next sequence is timing-dependent From warner at users.sourceforge.net Wed Dec 8 03:54:59 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Wed, 08 Dec 2004 03:54:59 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.332,1.333 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv1514 Modified Files: ChangeLog Log Message: * buildbot/test/test_run.py (Disconnect.testBuild4): validate that losing the slave in the middle of a remote step is handled too * buildbot/process/step.py (ShellCommand.interrupt): 'reason' can be a Failure, so be sure to stringify it before using it as the contents of the 'interrupt' logfile (RemoteCommand.interrupt): use stringified 'why' in remote_interruptCommand too, just in case Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.332 retrieving revision 1.333 diff -u -d -r1.332 -r1.333 --- ChangeLog 6 Dec 2004 09:00:30 -0000 1.332 +++ ChangeLog 8 Dec 2004 03:54:56 -0000 1.333 @@ -1,3 +1,14 @@ +2004-12-07 Brian Warner + + * buildbot/test/test_run.py (Disconnect.testBuild4): validate that + losing the slave in the middle of a remote step is handled too + + * buildbot/process/step.py (ShellCommand.interrupt): 'reason' can + be a Failure, so be sure to stringify it before using it as the + contents of the 'interrupt' logfile + (RemoteCommand.interrupt): use stringified 'why' in + remote_interruptCommand too, just in case + 2004-12-06 Brian Warner * buildbot/slave/commands.py 
(Arch.doVCUpdate): use 'tla replay' From warner at users.sourceforge.net Wed Dec 8 04:15:26 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Wed, 08 Dec 2004 04:15:26 +0000 Subject: [Buildbot-commits] buildbot/buildbot/status words.py,1.31,1.32 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/status In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5309/buildbot/status Modified Files: words.py Log Message: (IrcStatusBot.getBuilder): catch the KeyError that happens when you ask for a non-existent Builder, and translate it into a UsageError. Index: words.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/status/words.py,v retrieving revision 1.31 retrieving revision 1.32 diff -u -d -r1.31 -r1.32 --- words.py 4 Dec 2004 21:20:18 -0000 1.31 +++ words.py 8 Dec 2004 04:15:23 -0000 1.32 @@ -103,8 +103,9 @@ return meth def getBuilder(self, which): - b = self.status.getBuilder(which) - if not b: + try: + b = self.status.getBuilder(which) + except KeyError: raise UsageError, "no such builder '%s'" % which return b From warner at users.sourceforge.net Wed Dec 8 04:15:27 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Wed, 08 Dec 2004 04:15:27 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.333,1.334 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv5309 Modified Files: ChangeLog Log Message: (IrcStatusBot.getBuilder): catch the KeyError that happens when you ask for a non-existent Builder, and translate it into a UsageError. 
Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.333 retrieving revision 1.334 diff -u -d -r1.333 -r1.334 --- ChangeLog 8 Dec 2004 03:54:56 -0000 1.333 +++ ChangeLog 8 Dec 2004 04:15:24 -0000 1.334 @@ -1,5 +1,9 @@ 2004-12-07 Brian Warner + * buildbot/status/words.py (IrcStatusBot.getBuilder): catch the + KeyError that happens when you ask for a non-existent Builder, and + translate it into a UsageError. + * buildbot/test/test_run.py (Disconnect.testBuild4): validate that losing the slave in the middle of a remote step is handled too From warner at users.sourceforge.net Thu Dec 9 10:24:58 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Thu, 09 Dec 2004 10:24:58 +0000 Subject: [Buildbot-commits] buildbot/buildbot/process step_twisted.py,1.63,1.64 step.py,1.58,1.59 builder.py,1.20,1.21 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/process In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv18091/buildbot/process Modified Files: step_twisted.py step.py builder.py Log Message: * buildbot/process/step_twisted.py (Trial._commandComplete): update self.cmd when we start the 'cat test.log' transfer. Without this, we cannot interrupt the correct RemoteCommand when we lose the connection. 
* buildbot/process/step.py (RemoteCommand.interrupt): don't bother trying to tell the slave to stop the command if we're already inactive, or if we no longer have a .remote * buildbot/process/builder.py (Builder._detached): don't let an exception in currentBuild.stopBuild() prevent the builder from being marked offline Index: builder.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/process/builder.py,v retrieving revision 1.20 retrieving revision 1.21 diff -u -d -r1.20 -r1.21 --- builder.py 6 Dec 2004 07:36:33 -0000 1.20 +++ builder.py 9 Dec 2004 10:24:56 -0000 1.21 @@ -183,7 +183,11 @@ if self.currentBuild: log.msg("%s._detached: killing build" % self) # wasn't enough - self.currentBuild.stopBuild("slave lost") + try: + self.currentBuild.stopBuild("slave lost") + except: + log.msg("currentBuild.stopBuild failed") + log.err() self.currentBuild = None # TODO: should failover to a new Build self.builder_status.addPointEvent(['disconnect']) Index: step.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/process/step.py,v retrieving revision 1.58 retrieving revision 1.59 diff -u -d -r1.58 -r1.59 --- step.py 8 Dec 2004 03:54:56 -0000 1.58 +++ step.py 9 Dec 2004 10:24:56 -0000 1.59 @@ -82,18 +82,38 @@ return d def interrupt(self, why): + # TODO: consider separating this into interrupt() and stop(), where + # stop() unconditionally calls _finished, but interrupt() merely + # asks politely for the command to stop soon. 
+ + log.msg("RemoteCommand.interrupt", self, why) + if not self.active: + log.msg(" but this RemoteCommand is already inactive") + return + if not self.remote: + log.msg(" but our .remote went away") + return if isinstance(why, Failure) and why.check(error.ConnectionLost): - log.msg("RemoteCommand.disconnect: lost slave", self) + log.msg("RemoteCommand.disconnect: lost slave") self.remote = None - self._finished(Failure(error.ConnectionLost())) + self._finished(why) return # tell the remote command to halt. Returns a Deferred that will fire # when the interrupt command has been delivered. + d = defer.maybeDeferred(self.remote.callRemote, "interruptCommand", self.commandID, str(why)) + # the slave may not have remote_interruptCommand + d.addErrback(self._interruptFailed) return d + def _interruptFailed(self, why): + log.msg("RemoteCommand._interruptFailed", self) + # TODO: forcibly stop the Command now, since we can't stop it + # cleanly + return None + def remote_update(self, updates): max_updatenum = 0 for (update, num) in updates: Index: step_twisted.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/process/step_twisted.py,v retrieving revision 1.63 retrieving revision 1.64 diff -u -d -r1.63 -r1.64 --- step_twisted.py 23 Nov 2004 01:13:54 -0000 1.63 +++ step_twisted.py 9 Dec 2004 10:24:56 -0000 1.64 @@ -373,6 +373,7 @@ c2 = step.RemoteShellCommand(command=catcmd, workdir=self.workdir, ) + self.cmd = c2 loog = self.addLog("test.log") c2.useLog(loog, True) d = c2.run(self, self.remote) From warner at users.sourceforge.net Thu Dec 9 10:24:59 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Thu, 09 Dec 2004 10:24:59 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.334,1.335 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv18091 Modified Files: ChangeLog Log Message: * buildbot/process/step_twisted.py (Trial._commandComplete): 
update self.cmd when we start the 'cat test.log' transfer. Without this, we cannot interrupt the correct RemoteCommand when we lose the connection. * buildbot/process/step.py (RemoteCommand.interrupt): don't bother trying to tell the slave to stop the command if we're already inactive, or if we no longer have a .remote * buildbot/process/builder.py (Builder._detached): don't let an exception in currentBuild.stopBuild() prevent the builder from being marked offline Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.334 retrieving revision 1.335 diff -u -d -r1.334 -r1.335 --- ChangeLog 8 Dec 2004 04:15:24 -0000 1.334 +++ ChangeLog 9 Dec 2004 10:24:56 -0000 1.335 @@ -1,3 +1,18 @@ +2004-12-09 Brian Warner + + * buildbot/process/step_twisted.py (Trial._commandComplete): + update self.cmd when we start the 'cat test.log' transfer. Without + this, we cannot interrupt the correct RemoteCommand when we lose + the connection. 
+ + * buildbot/process/step.py (RemoteCommand.interrupt): don't bother + trying to tell the slave to stop the command if we're already + inactive, or if we no longer have a .remote + + * buildbot/process/builder.py (Builder._detached): don't let an + exception in currentBuild.stopBuild() prevent the builder from + being marked offline + 2004-12-07 Brian Warner * buildbot/status/words.py (IrcStatusBot.getBuilder): catch the From warner at users.sourceforge.net Thu Dec 9 10:26:16 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Thu, 09 Dec 2004 10:26:16 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.335,1.336 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv18350 Modified Files: ChangeLog Log Message: (StatusResourceBuilder.getChild): remove debug message Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.335 retrieving revision 1.336 diff -u -d -r1.335 -r1.336 --- ChangeLog 9 Dec 2004 10:24:56 -0000 1.335 +++ ChangeLog 9 Dec 2004 10:26:14 -0000 1.336 @@ -1,5 +1,8 @@ 2004-12-09 Brian Warner + * buildbot/status/html.py (StatusResourceBuilder.getChild): remove + debug message + * buildbot/process/step_twisted.py (Trial._commandComplete): update self.cmd when we start the 'cat test.log' transfer. 
Without this, we cannot interrupt the correct RemoteCommand when we lose From warner at users.sourceforge.net Thu Dec 9 10:26:16 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Thu, 09 Dec 2004 10:26:16 +0000 Subject: [Buildbot-commits] buildbot/buildbot/status html.py,1.48,1.49 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/status In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv18350/buildbot/status Modified Files: html.py Log Message: (StatusResourceBuilder.getChild): remove debug message Index: html.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/status/html.py,v retrieving revision 1.48 retrieving revision 1.49 diff -u -d -r1.48 -r1.49 --- html.py 6 Dec 2004 07:21:51 -0000 1.48 +++ html.py 9 Dec 2004 10:26:13 -0000 1.49 @@ -411,8 +411,6 @@ return Redirect("..") def getChild(self, path, request): - log.msg('path=%s, postpath=%s, prepath=%s' % (path, request.postpath, - request.prepath)) if path == "force": return self.force(request) if path == "ping": @@ -548,6 +546,7 @@ if data: self.req.write(data) return + # TODO: I've seen this double-print a line # now send all of .runEntries in a batch data = self.content(self.original.runEntries) if data: From warner at users.sourceforge.net Sat Dec 11 11:12:45 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 11 Dec 2004 11:12:45 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.336,1.337 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3363 Modified Files: ChangeLog Log Message: * buildbot/slave/bot.py (BotFactory): revamp keepalive/lost-master detection code. Require some sign of life from the buildmaster every BotFactory.keepaliveInterval seconds. Provoke this indication at BotFactory.keepaliveTimeout seconds before the deadline by sending a keepalive request. 
We don't actually care if that request is answered in a timely fashion, what we care about is that .activity() is called before the deadline. .activity() is triggered by any PB message from the master (including an ack to one of the slave's status-update messages). With this new scheme, large status messages over slow pipes are OK, as long as any given message can be sent (and thus acked) within .keepaliveTimeout seconds (which defaults to 30). (SlaveBuilder.remote_startCommand): record activity (SlaveBuilder.ackUpdate): same (SlaveBuilder.ackComplete): same (BotFactory.gotPerspective): same * buildbot/test/test_run.py (Disconnect.testSlaveTimeout): test it Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.336 retrieving revision 1.337 diff -u -d -r1.336 -r1.337 --- ChangeLog 9 Dec 2004 10:26:14 -0000 1.336 +++ ChangeLog 11 Dec 2004 11:12:35 -0000 1.337 @@ -1,3 +1,23 @@ +2004-12-11 Brian Warner + + * buildbot/slave/bot.py (BotFactory): revamp keepalive/lost-master + detection code. Require some sign of life from the buildmaster + every BotFactory.keepaliveInterval seconds. Provoke this + indication at BotFactory.keepaliveTimeout seconds before the + deadline by sending a keepalive request. We don't actually care if + that request is answered in a timely fashion, what we care about + is that .activity() is called before the deadline. .activity() is + triggered by any PB message from the master (including an ack to + one of the slave's status-update messages). With this new scheme, + large status messages over slow pipes are OK, as long as any given + message can be sent (and thus acked) within .keepaliveTimeout + seconds (which defaults to 30). 
+ (SlaveBuilder.remote_startCommand): record activity + (SlaveBuilder.ackUpdate): same + (SlaveBuilder.ackComplete): same + (BotFactory.gotPerspective): same + * buildbot/test/test_run.py (Disconnect.testSlaveTimeout): test it + 2004-12-09 Brian Warner * buildbot/status/html.py (StatusResourceBuilder.getChild): remove From warner at users.sourceforge.net Sat Dec 11 11:12:37 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 11 Dec 2004 11:12:37 +0000 Subject: [Buildbot-commits] buildbot/buildbot/test test_run.py,1.20,1.21 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/test In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3363/buildbot/test Modified Files: test_run.py Log Message: * buildbot/slave/bot.py (BotFactory): revamp keepalive/lost-master detection code. Require some sign of life from the buildmaster every BotFactory.keepaliveInterval seconds. Provoke this indication at BotFactory.keepaliveTimeout seconds before the deadline by sending a keepalive request. We don't actually care if that request is answered in a timely fashion, what we care about is that .activity() is called before the deadline. .activity() is triggered by any PB message from the master (including an ack to one of the slave's status-update messages). With this new scheme, large status messages over slow pipes are OK, as long as any given message can be sent (and thus acked) within .keepaliveTimeout seconds (which defaults to 30). 
(SlaveBuilder.remote_startCommand): record activity (SlaveBuilder.ackUpdate): same (SlaveBuilder.ackComplete): same (BotFactory.gotPerspective): same * buildbot/test/test_run.py (Disconnect.testSlaveTimeout): test it Index: test_run.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/test/test_run.py,v retrieving revision 1.20 retrieving revision 1.21 diff -u -d -r1.20 -r1.21 --- test_run.py 8 Dec 2004 03:54:56 -0000 1.20 +++ test_run.py 11 Dec 2004 11:12:33 -0000 1.21 @@ -8,6 +8,7 @@ #log.startLogging(sys.stderr) from buildbot import master, interfaces +from buildbot.util import now from buildbot.slave import bot from buildbot.changes.changes import Change from buildbot.status import builder @@ -157,12 +158,26 @@ self.slave2 = slave slave.startService() + def connectSlave3(self): + # this slave has a very fast keepalive timeout + port = self.master.slavePort._port.getHost().port + os.mkdir("slavebase") + slave = MyBuildSlave("localhost", port, "bot1", "sekrit", + "slavebase", keepalive=2, usePTY=1, + keepaliveTimeout=1) + slave.info = {"admin": "one"} + self.slave = slave + slave.startService() + d = self.master.botmaster.waitUntilBuilderAttached("dummy") + dr(d) + def tearDown(self): log.msg("doing tearDown") self.shutdownSlave() if self.master: dr(defer.maybeDeferred(self.master.stopService)) self.master = None + log.msg("tearDown done") # various forms of slave death @@ -342,7 +357,7 @@ class Disconnect(RunMixin, unittest.TestCase): - def disconnectSetup(self): + def disconnectSetupMaster(self): # verify that disconnecting the slave during a build properly # terminates the build m = self.master @@ -360,11 +375,21 @@ self.failUnlessEqual(s1.getCurrentBuild(), None) self.failUnlessEqual(s1.getLastFinishedBuild(), None) self.failUnlessEqual(s1.getBuild(-1), None) + return m,s,c,s1 + def disconnectSetup(self): + m,s,c,s1 = self.disconnectSetupMaster() self.connectSlave() 
self.failUnlessEqual(s1.getState(), ("idle", None, None)) return m,s,c,s1 + def disconnectSetup2(self): + m,s,c,s1 = self.disconnectSetupMaster() + self.connectSlave3() + self.failUnlessEqual(s1.getState(), ("idle", None, None)) + return m,s,c,s1 + + def verifyDisconnect(self, bs): self.failUnless(bs.isFinished()) @@ -413,6 +438,41 @@ print bs.getText() testIdle2.skip = "short timeout not yet implemented" + def testSlaveTimeout(self): + m,s,c,s1 = self.disconnectSetup2() # fast timeout + + # now suppose the slave goes missing. We want to find out when it + # creates a new Broker, so we reach inside and mark it with the + # well-known sigil of impending messy death. + bd = self.slave.getServiceNamed("bot").builders["dummy"] + broker = bd.remote.broker + broker.redshirt = 1 + + # make sure the keepalives will keep the connection up + later = now() + 5 + while 1: + if now() > later: + break + bd = self.slave.getServiceNamed("bot").builders["dummy"] + if not bd.remote or not hasattr(bd.remote.broker, "redshirt"): + self.fail("slave disconnected when it shouldn't have") + reactor.iterate(0.01) + + d = self.master.botmaster.waitUntilBuilderDetached("dummy") + # whoops! how careless of me. + self.disappearSlave() + + # the slave will realize the connection is lost within 2 seconds, and + # reconnect. + dr(d, 5) + d = self.master.botmaster.waitUntilBuilderAttached("dummy") + dr(d, 5) + # make sure it is a new connection (i.e. a new Broker) + bd = self.slave.getServiceNamed("bot").builders["dummy"] + self.failUnless(bd.remote, "hey, slave isn't really connected") + self.failIf(hasattr(bd.remote.broker, "redshirt"), + "hey, slave's Broker is still marked for death") + def testBuild1(self): m,s,c,s1 = self.disconnectSetup() # this next sequence is timing-dependent. 
The dummy build takes at From warner at users.sourceforge.net Sat Dec 11 11:12:37 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 11 Dec 2004 11:12:37 +0000 Subject: [Buildbot-commits] buildbot/buildbot/slave bot.py,1.5,1.6 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot/slave In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv3363/buildbot/slave Modified Files: bot.py Log Message: * buildbot/slave/bot.py (BotFactory): revamp keepalive/lost-master detection code. Require some sign of life from the buildmaster every BotFactory.keepaliveInterval seconds. Provoke this indication at BotFactory.keepaliveTimeout seconds before the deadline by sending a keepalive request. We don't actually care if that request is answered in a timely fashion, what we care about is that .activity() is called before the deadline. .activity() is triggered by any PB message from the master (including an ack to one of the slave's status-update messages). With this new scheme, large status messages over slow pipes are OK, as long as any given message can be sent (and thus acked) within .keepaliveTimeout seconds (which defaults to 30). 
(SlaveBuilder.remote_startCommand): record activity (SlaveBuilder.ackUpdate): same (SlaveBuilder.ackComplete): same (BotFactory.gotPerspective): same * buildbot/test/test_run.py (Disconnect.testSlaveTimeout): test it Index: bot.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/bot.py,v retrieving revision 1.5 retrieving revision 1.6 diff -u -d -r1.5 -r1.6 --- bot.py 4 Dec 2004 21:02:03 -0000 1.5 +++ bot.py 11 Dec 2004 11:12:35 -0000 1.6 @@ -8,6 +8,7 @@ from twisted.application import service, internet from twisted.cred import credentials +from buildbot.util import now from buildbot.pbutil import ReconnectingPBClientFactory from buildbot.slave import registry # make sure the standard commands get registered @@ -81,6 +82,12 @@ if self.stopCommandOnShutdown: self.stopCommand() + def activity(self): + #bot = self.parent + #buildslave = bot.parent + #bf = buildslave.bf + self.parent.parent.bf.activity() + def remote_setMaster(self, remote): self.remote = remote self.remote.notifyOnDisconnect(self.lostRemote) @@ -110,6 +117,8 @@ """This is called multiple times by various master-side BuildSteps, to start various commands that actually do the build.""" + self.activity() + if self.command: log.msg("leftover command, dropping it") self.stopCommand() @@ -132,6 +141,7 @@ def remote_interruptCommand(self, stepId, why): """Halt the current step.""" log.msg("asked to interrupt current command: %s" % why) + self.activity() if not self.command: # TODO: just log it, a race could result in their interrupting a # command that wasn't actually running @@ -175,12 +185,10 @@ d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate") def ackUpdate(self, acknum): - # TODO: update the "last activity" timer - pass + self.activity() # update the "last activity" timer def ackComplete(self, dummy): - # TODO: update the "last activity" timer - pass + self.activity() # update the "last activity" timer def _ackFailed(self, why, 
where): log.msg("SlaveBuilder._ackFailed:", where) @@ -283,13 +291,32 @@ d.addCallbacks(log.msg, log.err) class BotFactory(ReconnectingPBClientFactory): - keepaliveTimeout = 30 + # 'keepaliveInterval' serves two purposes. The first is to keep the + # connection alive: it guarantees that there will be at least some + # traffic once every 'keepaliveInterval' seconds, which may help keep an + # interposed NAT gateway from dropping the address mapping because it + # thinks the connection has been abandoned. The second is to put an upper + # limit on how long the buildmaster might have gone away before we notice + # it. For this second purpose, we insist upon seeing *some* evidence of + # the buildmaster at least once every 'keepaliveInterval' seconds. + keepaliveInterval = None # None = do not use keepalives + + # 'keepaliveTimeout' seconds before the interval expires, we will send a + # keepalive request, both to add some traffic to the connection, and to + # prompt a response from the master in case all our builders are idle. We + # don't insist upon receiving a timely response from this message: a slow + # link might put the request at the wrong end of a large build message. 
+ keepaliveTimeout = 30 # how long we will go without a response + + keepaliveTimer = None + activityTimer = None + lastActivity = 0 unsafeTracebacks = 1 - def __init__(self, keepaliveInterval=0): + def __init__(self, keepaliveInterval, keepaliveTimeout): ReconnectingPBClientFactory.__init__(self) - self.keepaliveTimer = None self.keepaliveInterval = keepaliveInterval + self.keepaliveTimeout = keepaliveTimeout def startedConnecting(self, connector): ReconnectingPBClientFactory.startedConnecting(self, connector) @@ -304,10 +331,11 @@ log.msg("unable to set SO_KEEPALIVE") if not self.keepaliveInterval: self.keepaliveInterval = 10*60 + self.activity() if self.keepaliveInterval: log.msg("sending application-level keepalives every %d seconds" \ % self.keepaliveInterval) - self.startKeepaliveTimer() + self.startTimers() def clientConnectionFailed(self, connector, reason): self.connector = None @@ -316,55 +344,74 @@ def clientConnectionLost(self, connector, reason): self.connector = None + self.stopTimers() + self.perspective = None ReconnectingPBClientFactory.clientConnectionLost(self, connector, reason) + + def startTimers(self): + assert self.keepaliveInterval + assert not self.keepaliveTimer + assert not self.activityTimer + # Insist that doKeepalive fires before checkActivity. Really, it + # needs to happen at least one RTT beforehand. 
+ assert self.keepaliveInterval > self.keepaliveTimeout + + # arrange to send a keepalive a little while before our deadline + when = self.keepaliveInterval - self.keepaliveTimeout + self.keepaliveTimer = reactor.callLater(when, self.doKeepalive) + # and check for activity too + self.activityTimer = reactor.callLater(self.keepaliveInterval, + self.checkActivity) + + def stopTimers(self): if self.keepaliveTimer: self.keepaliveTimer.cancel() self.keepaliveTimer = None - self.perspective = None + if self.activityTimer: + self.activityTimer.cancel() + self.activityTimer = None - def startKeepaliveTimer(self): - """Once this is called, the bot will send an application-level - keepalive query every self.keepaliveInterval seconds. If the master - has not responded within self.keepaliveTimeout seconds, the link - is deemed dead and we will initiate a reconnect. - """ - self.keepaliveTimer = reactor.callLater(self.keepaliveInterval, - self.doKeepalive) + def activity(self, res=None): + self.lastActivity = now() def doKeepalive(self): + # send the keepalive request. If it fails outright, the connection + # was already dropped, so just log and ignore. self.keepaliveTimer = None log.msg("sending app-level keepalive") d = self.perspective.callRemote("keepalive") - d.setTimeout(self.keepaliveTimeout) - # TODO: when the timeout is invoked, the PBConnectionLost causes an - # AlreadyCalledError + d.addCallback(self.activity) d.addErrback(self.keepaliveLost) - self.startKeepaliveTimer() - def keepaliveLost(self, why): - # either the network connection was lost (why == StaleReference or - # something), or the network has silently gone away (perhaps a NAT - # box has forgotten about us) (why == TimeoutError). 
- log.msg("keepaliveLost", why) - self.perspective.broker.transport.loseConnection() + def keepaliveLost(self, f): + log.msg("BotFactory.keepaliveLost") + + def checkActivity(self): + self.activityTimer = None + if self.lastActivity + self.keepaliveInterval < now(): + log.msg("BotFactory.checkActivity: nothing from master for " + "%d secs" % (now() - self.lastActivity)) + self.perspective.broker.transport.loseConnection() + return + self.startTimers() def stopFactory(self): ReconnectingPBClientFactory.stopFactory(self) - if self.keepaliveTimer: - self.keepaliveTimer.cancel() - self.keepaliveTimer = None + self.stopTimers() class BuildSlave(service.MultiService): botClass = Bot def __init__(self, host, port, name, passwd, basedir, keepalive, - usePTY): + usePTY, keepaliveTimeout=30): service.MultiService.__init__(self) bot = self.botClass(basedir, usePTY) bot.setServiceParent(self) - bf = self.bf = BotFactory(keepalive) + if keepalive == 0: + keepalive = None + bf = self.bf = BotFactory(keepalive, keepaliveTimeout) bf.startLogin(credentials.UsernamePassword(name, passwd), client=bot) self.connection = c = internet.TCPClient(host, port, bf) c.setServiceParent(self) From warner at users.sourceforge.net Sat Dec 11 20:59:08 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 11 Dec 2004 20:59:08 +0000 Subject: [Buildbot-commits] buildbot NEWS,1.34,1.35 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24541 Modified Files: NEWS Log Message: bring it up to date Index: NEWS =================================================================== RCS file: /cvsroot/buildbot/buildbot/NEWS,v retrieving revision 1.34 retrieving revision 1.35 diff -u -d -r1.34 -r1.35 --- NEWS 6 Dec 2004 08:09:30 -0000 1.34 +++ NEWS 11 Dec 2004 20:59:05 -0000 1.35 @@ -12,7 +12,9 @@ is added to a pseudo-logfile for the step that was stopped, but if you only look at the top-level status it appears that the build failed on its own). 
-Builds are also halted if the connection to the buildmaster is lost. +Builds are also halted if the connection to the buildslave is lost. On the +slave side, any active commands are halted if the connection to the +buildmaster is lost. ** minor new features @@ -33,9 +35,25 @@ happen when the slave is sending large logfiles over a slow link, while using short keepalive timeouts. The buildmaster has been fixed to allow the second connection attempt to take precedence over the first, so that the older -connection is jettisoned to make way for the newer one. This is half of the -fix, the other will involve fixing the slave to avoid the double-connect -situation in the first place. +connection is jettisoned to make way for the newer one. + +In addition, the buildslave has been fixed to be less twitchy about timeouts. +There are now two parameters: keepaliveInterval (which is controlled by the +mktap 'keepalive' argument), and keepaliveTimeout (which requires editing the +.py source to change from the default of 30 seconds). The slave expects to +see *something* from the master at least once every keepaliveInterval +seconds, and will try to provoke a response (by sending a keepalive request) +'keepaliveTimeout' seconds before the end of this interval just in case there +was no regular traffic. Any kind of traffic will qualify, including +acknowledgements of normal build-status updates. + +The net result is that, as long as any given PB message can be sent over the +wire in less than 'keepaliveTimeout' seconds, the slave should not mistakenly +disconnect because of a timeout. There will be traffic on the wire at least +every 'keepaliveInterval' seconds, which is what you want to pay attention to +if you're trying to keep an intervening NAT box from dropping what it thinks +is an abandoned connection. A quiet loss of connection will be detected +within 'keepaliveInterval' seconds. 
*** Large Logfiles From warner at users.sourceforge.net Sat Dec 11 20:59:41 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Sat, 11 Dec 2004 20:59:41 +0000 Subject: [Buildbot-commits] buildbot ChangeLog,1.337,1.338 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv24604 Modified Files: ChangeLog Log Message: NEWS: bring it up to date Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.337 retrieving revision 1.338 diff -u -d -r1.337 -r1.338 --- ChangeLog 11 Dec 2004 11:12:35 -0000 1.337 +++ ChangeLog 11 Dec 2004 20:59:39 -0000 1.338 @@ -1,5 +1,7 @@ 2004-12-11 Brian Warner + * NEWS: bring it up to date + * buildbot/slave/bot.py (BotFactory): revamp keepalive/lost-master detection code. Require some sign of life from the buildmaster every BotFactory.keepaliveInterval seconds. Provoke this From warner at users.sourceforge.net Mon Dec 13 08:23:44 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 13 Dec 2004 08:23:44 +0000 Subject: [Buildbot-commits] buildbot/debian changelog,1.3,1.4 Message-ID: Update of /cvsroot/buildbot/buildbot/debian In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15106/debian Modified Files: changelog Log Message: * buildbot/__init__.py (version): Releasing buildbot-0.6.2 * debian/changelog: update for 0.6.2 * NEWS: finalize for 0.6.2 Index: changelog =================================================================== RCS file: /cvsroot/buildbot/buildbot/debian/changelog,v retrieving revision 1.3 retrieving revision 1.4 diff -u -d -r1.3 -r1.4 --- changelog 6 Dec 2004 08:47:40 -0000 1.3 +++ changelog 13 Dec 2004 08:23:41 -0000 1.4 @@ -1,3 +1,9 @@ +buildbot (0.6.2-1) unstable; urgency=low + + * New upstream release + + -- Brian Warner Mon, 13 Dec 2004 00:22:46 -0800 + buildbot (0.6.1-1) unstable; urgency=low * New upstream release From warner at 
users.sourceforge.net Mon Dec 13 08:23:44 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 13 Dec 2004 08:23:44 +0000 Subject: [Buildbot-commits] buildbot/buildbot __init__.py,1.6,1.7 Message-ID: Update of /cvsroot/buildbot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15106/buildbot Modified Files: __init__.py Log Message: * buildbot/__init__.py (version): Releasing buildbot-0.6.2 * debian/changelog: update for 0.6.2 * NEWS: finalize for 0.6.2 Index: __init__.py =================================================================== RCS file: /cvsroot/buildbot/buildbot/buildbot/__init__.py,v retrieving revision 1.6 retrieving revision 1.7 diff -u -d -r1.6 -r1.7 --- __init__.py 24 Nov 2004 02:37:12 -0000 1.6 +++ __init__.py 13 Dec 2004 08:23:42 -0000 1.7 @@ -1,3 +1,3 @@ #! /usr/bin/python -version = "0.6.1+" +version = "0.6.2" From warner at users.sourceforge.net Mon Dec 13 08:23:44 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 13 Dec 2004 08:23:44 +0000 Subject: [Buildbot-commits] buildbot NEWS,1.35,1.36 ChangeLog,1.338,1.339 Message-ID: Update of /cvsroot/buildbot/buildbot In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv15106 Modified Files: NEWS ChangeLog Log Message: * buildbot/__init__.py (version): Releasing buildbot-0.6.2 * debian/changelog: update for 0.6.2 * NEWS: finalize for 0.6.2 Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v retrieving revision 1.338 retrieving revision 1.339 diff -u -d -r1.338 -r1.339 --- ChangeLog 11 Dec 2004 20:59:39 -0000 1.338 +++ ChangeLog 13 Dec 2004 08:23:42 -0000 1.339 @@ -1,3 +1,10 @@ +2004-12-13 Brian Warner + + * buildbot/__init__.py (version): Releasing buildbot-0.6.2 + + * debian/changelog: update for 0.6.2 + * NEWS: finalize for 0.6.2 + 2004-12-11 Brian Warner * NEWS: bring it up to date Index: NEWS =================================================================== 
RCS file: /cvsroot/buildbot/buildbot/NEWS,v retrieving revision 1.35 retrieving revision 1.36 diff -u -d -r1.35 -r1.36 --- NEWS 11 Dec 2004 20:59:05 -0000 1.35 +++ NEWS 13 Dec 2004 08:23:42 -0000 1.36 @@ -1,6 +1,6 @@ User visible changes in Buildbot. -* Release ? (?) +* Release 0.6.2 (13 Dec 2004) ** new features From warner at users.sourceforge.net Mon Dec 13 08:39:38 2004 From: warner at users.sourceforge.net (Brian Warner) Date: Mon, 13 Dec 2004 08:39:38 +0000 Subject: [Buildbot-commits] site ChangeLog,1.17,1.18 NEWS,1.3,1.4 index.html,1.33,1.34 Message-ID: Update of /cvsroot/buildbot/site In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv18329 Modified Files: ChangeLog NEWS index.html Log Message: update for 0.6.2, add spamassassin.org buildbot Index: ChangeLog =================================================================== RCS file: /cvsroot/buildbot/site/ChangeLog,v retrieving revision 1.17 retrieving revision 1.18 diff -u -d -r1.17 -r1.18 --- ChangeLog 23 Nov 2004 11:41:00 -0000 1.17 +++ ChangeLog 13 Dec 2004 08:39:34 -0000 1.18 @@ -1,3 +1,289 @@ +2004-12-13 Brian Warner + + * buildbot/__init__.py (version): Releasing buildbot-0.6.2 + + * debian/changelog: update for 0.6.2 + * NEWS: finalize for 0.6.2 + +2004-12-11 Brian Warner + + * NEWS: bring it up to date + + * buildbot/slave/bot.py (BotFactory): revamp keepalive/lost-master + detection code. Require some sign of life from the buildmaster + every BotFactory.keepaliveInterval seconds. Provoke this + indication at BotFactory.keepaliveTimeout seconds before the + deadline by sending a keepalive request. We don't actually care if + that request is answered in a timely fashion, what we care about + is that .activity() is called before the deadline. .activity() is + triggered by any PB message from the master (including an ack to + one of the slave's status-update messages). 
With this new scheme, + large status messages over slow pipes are OK, as long as any given + message can be sent (and thus acked) within .keepaliveTimeout + seconds (which defaults to 30). + (SlaveBuilder.remote_startCommand): record activity + (SlaveBuilder.ackUpdate): same + (SlaveBuilder.ackComplete): same + (BotFactory.gotPerspective): same + * buildbot/test/test_run.py (Disconnect.testSlaveTimeout): test it + +2004-12-09 Brian Warner + + * buildbot/status/html.py (StatusResourceBuilder.getChild): remove + debug message + + * buildbot/process/step_twisted.py (Trial._commandComplete): + update self.cmd when we start the 'cat test.log' transfer. Without + this, we cannot interrupt the correct RemoteCommand when we lose + the connection. + + * buildbot/process/step.py (RemoteCommand.interrupt): don't bother + trying to tell the slave to stop the command if we're already + inactive, or if we no longer have a .remote + + * buildbot/process/builder.py (Builder._detached): don't let an + exception in currentBuild.stopBuild() prevent the builder from + being marked offline + +2004-12-07 Brian Warner + + * buildbot/status/words.py (IrcStatusBot.getBuilder): catch the + KeyError that happens when you ask for a non-existent Builder, and + translate it into a UsageError. + + * buildbot/test/test_run.py (Disconnect.testBuild4): validate that + losing the slave in the middle of a remote step is handled too + + * buildbot/process/step.py (ShellCommand.interrupt): 'reason' can + be a Failure, so be sure to stringify it before using it as the + contents of the 'interrupt' logfile + (RemoteCommand.interrupt): use stringified 'why' in + remote_interruptCommand too, just in case + +2004-12-06 Brian Warner + + * buildbot/slave/commands.py (Arch.doVCUpdate): use 'tla replay' + instead of 'tla update', which is more efficient in case we've + missed a couple of patches since the last update. + + * debian/changelog: update for previous (0.6.1) release. 
Obviously + this needs to be handled better. + +2004-12-05 Brian Warner + + * NEWS: update for stuff since last release + + * buildbot/master.py (DebugPerspective.attached): return 'self', to + match the maybeDeferred change in Dispatcher.requestAvatar + * buildbot/changes/pb.py (ChangePerspective.attached): same + * buildbot/status/client.py (StatusClientPerspective.attached): same + * buildbot/process/builder.py (Builder._attached3): same + * buildbot/pbutil.py (NewCredPerspective.attached): same + + * buildbot/status/html.py (WaterfallStatusResource.phase2): Add + the date to the top-most box, if it is not the same as today's + date. + + * docs/slave.xhtml: provide a buildslave setup checklist + + * docs/source.xhtml (Arch): correct terminology + +2004-12-04 Brian Warner + + * buildbot/test/test_slavecommand.py: use sys.executable instead + of hard-coding 'python' for child commands, might help portability + + * docs/examples/twisted_master.cfg: update to current usage + + * buildbot/status/words.py (IrcStatusBot.command_STOP): add a + 'stop build' command to the IRC bot + + * buildbot/master.py (Dispatcher.requestAvatar): remove debug + message that broke PBChangeSource + + * buildbot/slave/bot.py: clean up shutdown/lose-master code + (SlaveBuilder): make some attributes class-level, remove the old + "update queue" which existed to support resuming a build after the + master connection was lost. Try to reimplement that feature later. + (SlaveBuilder.stopCommand): clear self.command when the + SlaveCommand finishes, so that we don't try to kill a leftover one + at shutdown time. + (SlaveBuilder.commandComplete): same, merge with commandFailed and + .finishCommand + + * buildbot/slave/commands.py (SourceBase): set self.command for + all VC commands, so they can be interrupted. 
+ +2004-12-03 Brian Warner + + * buildbot/master.py: clean up slave-handling code, to handle + slave-disconnect and multiple-connect better + (BotPerspective): make these long-lasting, exactly one per bot + listed in the config file. + (BotPerspective.attached): if a slave connects while an existing + one appears to still be connected, disconnect the old one first. + (BotPerspective.disconnect): new method to forcibly disconnect a + buildslave. Use some hacks to empty the transmit buffer quickly to + avoid the long (20-min?) TCP timeout that could occur if the old + slave has dropped off the net. + (BotMaster): Keep persistent BotPerspectives in .slaves, let them + own their own SlaveStatus objects. Remove .attached/.detached, add + .addSlave/.removeSlave, treat slaves like Builders (config file + parsing sends deltas to the BotMaster). Inform the slave + instances, i.e. the BotPerspective, about addBuilder and + removeBuilder. + (BotMaster.getPerspective): turns into a single dict lookup + (Dispatcher.requestAvatar): allow .attached to return a Deferred, + which gives BotPerspective.attached a chance to disconnect the old + slave first. + (BuildMaster.loadConfig): add code (disabled) to validate that all + builders use known slaves (listed in c['bots']). The check won't + work with tuple-specified builders, which are deprecated but not + yet invalid, so the check is disabled for now. + (BuildMaster.loadConfig_Slaves): move slave-config into a separate + routine, do the add/changed/removed dance with them like we do + with builders. + (BuildMaster.loadConfig_Sources): move source-config into a + separate routine too + + * buildbot/status/builder.py (Status.getSlave): get the + SlaveStatus object from the BotPerspective, not the BotMaster. 
+ + * buildbot/test/test_run.py: bunch of new tests for losing the + buildslave at various points in the build, handling a slave that + connects multiple times, and making sure we can interrupt a + running build + + * buildbot/slave/bot.py (BuildSlave): make it possible to use + something other than 'Bot' for the Bot object, to make certain + test cases easier to write. + (BuildSlave.waitUntilDisconnected): utility method for testing + +2004-11-30 Brian Warner + + * buildbot/test/test_run.py (RunMixin): refactor, remove debug msg + + * buildbot/interfaces.py (IBuilderControl.ping): add timeout= + argument, return a Deferred that always fires with True or False. + I don't use an errback to indicate 'ping failed' so that callers + are free to ignore the deferred without causing spurious errors in + the logs. + * buildbot/process/builder.py (BuilderControl.ping): implement it + + * buildbot/test/test_run.py (Status.testDisappear): test ping + (Status.disappearSlave): fix it + +2004-11-30 Brian Warner + + * buildbot/interfaces.py (IBuildControl): add .stopBuild + (IBuilderControl): add .getBuild(num), only works for the current + build, of course, although it might be interesting to offer + something for builds in the .waiting or .interlocked state. + + * buildbot/process/base.py (Build): have .stopBuild just do the + interrupt, then let the build die by itself. + (BuildControl): add .stopBuild, and add a point-event named + 'interrupt' just after the build so status viewers can tell that + someone killed it. + (BuilderControl): add .getBuild + + * buildbot/process/step.py (Dummy): use haltOnFailure so it really + stops when you kill it, good for testing + (ShellCommand.interrupt): add a logfile named 'interrupt' which + contains the 'reason' text. + + * buildbot/status/html.py: Add Stop Build button, if the build can + still be stopped. Send a Redirect (to the top page) one second + later, hopefully long enough for the interrupt to have an effect. 
+ Move make_row() up to top-level to share it between Stop Build and + Force Build. + + * buildbot/slave/commands.py: only kill the child process once + + * buildbot/test/test_run.py: add testInterrupt + +2004-11-29 Brian Warner + + * buildbot/process/base.py: Refactor command interruption. The + Build is now responsible for noticing that the slave has gone + away: Build.lostRemote() interrupts the current step and makes + sure that no further ones will be started. + + * buildbot/process/builder.py: When the initial remote_startBuild + message fails, log it: this usually indicates that the slave has + gone away, but we don't really start paying attention until they + fail to respond to the first step's command. + + * buildbot/process/step.py (RemoteCommand): Does *not* watch for + slave disconnect. Now sports a new interrupt() method. Error + handling was simplified a lot by chaining deferreds, so + remoteFailed/remoteComplete were merged into a single + remoteComplete method (which can now get a Failure object). + Likewise failed/finished were merged into just _finished. + (BuildStep): Add interrupt(why) method, and if why is a + ConnectionLost Failure then the step is failed with some useful + error text. + + * buildbot/slave/bot.py: stop the current command when the remote + Step reference is lost, and when the slave is shut down. + (Bot): make it a MultiService, so it can have children. Use + stopService to tell when the slave is shutting down. + (SlaveBuilder): make it a Service, and a child of the Bot. Add + remote_interruptCommand (which asks the current SlaveCommand to + stop but allows it to keep emitting status messages), and + stopCommand (which tells it to shut up and die). + + * buildbot/slave/commands.py: make commands interruptible + (ShellCommand.kill): factor out os.kill logic + (Command): factor out setup() + (Command.sendStatus): don't send status if .running is false, this + happens when the command has been halted. 
+ (Command.interrupt): new method, used to tell the command to die + (SlaveShellCommand): implement .interrupt + (DummyCommand): implement .interrupt + (SourceBase, etc): factor out setup(), don't continue substeps if + .interrupted is set + + * buildbot/status/builder.py: fix all waitUntilFinished() methods + so they can be called after finishing + + * buildbot/test/test_run.py: new tests for disconnect behavior, + refactor slave-shutdown routines, add different kinds of + slave-shutdown + +2004-11-27 Brian Warner + + * buildbot/status/words.py (IrcStatusBot.convertTime): utility + method to express ETA time like "2m45s" instead of "165 seconds" + +2004-11-24 Brian Warner + + * buildbot/test/test_vc.py (VC.testArch): unregister the test + archive after the test completes, to avoid cluttering the user's + 'tla archives' listing with a bogus entry. Arch doesn't happen to + provide any way to override the use of ~/.arch-params/, so there + isn't a convenient way to avoid touching the setup of the user who + runs the test. + (VC_HTTP.testArchHTTP): same + +2004-11-23 Brian Warner + + * buildbot/status/html.py (TextLog): split render() up into + render_HEAD and render_GET. Use a Producer when sending log + chunks, to reduce memory requirements and avoid sending huge + non-Banana-able strings over web.distrib connections. Requires + peeking under the covers of IStatusLog. + (TextLog.resumeProducing): fix the "as text" link, handle client + disconnects that occur while we're still sending old chunks. 
+ + * buildbot/status/builder.py (HTMLLogFile.waitUntilFinished): oops, + use defer.succeed, not the non-existent defer.success + (LogFile.waitUntilFinished): same + (LogFile.subscribe): don't add watchers to a finished logfile + + * buildbot/__init__.py (version): bump to 0.6.1+ while between + releases + 2004-11-23 Brian Warner * buildbot/__init__.py (version): Releasing buildbot-0.6.1 Index: NEWS =================================================================== RCS file: /cvsroot/buildbot/site/NEWS,v retrieving revision 1.3 retrieving revision 1.4 diff -u -d -r1.3 -r1.4 --- NEWS 23 Nov 2004 11:41:00 -0000 1.3 +++ NEWS 13 Dec 2004 08:39:35 -0000 1.4 @@ -1,5 +1,71 @@ User visible changes in Buildbot. +* Release 0.6.2 (13 Dec 2004) + +** new features + +It is now possible to interrupt a running build. Both the web page and the +IRC bot feature 'stop build' commands, which can be used to interrupt the +current BuildStep and accelerate the termination of the overall Build. The +status reporting for these still leaves something to be desired (an +'interrupt' event is pushed into the column, and the reason for the interrupt +is added to a pseudo-logfile for the step that was stopped, but if you only +look at the top-level status it appears that the build failed on its own). + +Builds are also halted if the connection to the buildslave is lost. On the +slave side, any active commands are halted if the connection to the +buildmaster is lost. + +** minor new features + +The IRC log bot now reports ETA times in a MMSS format like "2m45s" instead +of the clunky "165 seconds". + +** bug fixes + +*** Slave Disconnect + +Slave disconnects should be handled better now: the current build should be +abandoned properly. Earlier versions could get into weird states where the +build failed to finish, clogging the builder forever (or at least until the +buildmaster was restarted). 
+ +In addition, there are weird network conditions which could cause a +buildslave to attempt to connect twice to the same buildmaster. This can +happen when the slave is sending large logfiles over a slow link, while using +short keepalive timeouts. The buildmaster has been fixed to allow the second +connection attempt to take precedence over the first, so that the older +connection is jettisoned to make way for the newer one. + +In addition, the buildslave has been fixed to be less twitchy about timeouts. +There are now two parameters: keepaliveInterval (which is controlled by the +mktap 'keepalive' argument), and keepaliveTimeout (which requires editing the +.py source to change from the default of 30 seconds). The slave expects to +see *something* from the master at least once every keepaliveInterval +seconds, and will try to provoke a response (by sending a keepalive request) +'keepaliveTimeout' seconds before the end of this interval just in case there +was no regular traffic. Any kind of traffic will qualify, including +acknowledgements of normal build-status updates. + +The net result is that, as long as any given PB message can be sent over the +wire in less than 'keepaliveTimeout' seconds, the slave should not mistakenly +disconnect because of a timeout. There will be traffic on the wire at least +every 'keepaliveInterval' seconds, which is what you want to pay attention to +if you're trying to keep an intervening NAT box from dropping what it thinks +is an abandoned connection. A quiet loss of connection will be detected +within 'keepaliveInterval' seconds. + +*** Large Logfiles + +The web page rendering code has been fixed to deliver large logfiles in +pieces, using a producer/consumer apparatus. This avoids the large spike in +memory consumption when the log file body was linearized into a single string +and then buffered in the socket's application-side transmit buffer. 
This +should also avoid the 640k single-string limit for web.distrib servers that +could be hit by large (>640k) logfiles. + + + * Release 0.6.1 (23 Nov 2004) ** win32 improvements/bugfixes Index: index.html =================================================================== RCS file: /cvsroot/buildbot/site/index.html,v retrieving revision 1.33 retrieving revision 1.34 diff -u -d -r1.33 -r1.34 --- index.html 6 Dec 2004 07:07:51 -0000 1.33 +++ index.html 13 Dec 2004 08:39:35 -0000 1.34 @@ -11,7 +11,7 @@ Current contents:
      -
    • The current release is buildbot-0.6.1. You can download the source +
    • The current release is buildbot-0.6.2. You can download the source from the sf.net download page here. The release is signed with my GPG public key, available Mono project has a buildbot running to test the HEAD and release branches of the main project on several architectures.
    • - + +
    • Justin Mason reports that the SpamAssassin project is running a buildbot too.
    • +
    • Install a Buildbot today and get your name added here!
    @@ -114,5 +117,5 @@ align="right" /> -Last modified: Sun Dec 5 23:07:10 PST 2004 +Last modified: Mon Dec 13 00:38:07 PST 2004