From warner at users.sourceforge.net Fri Dec 3 22:54:54 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Fri, 03 Dec 2004 22:54:54 +0000
Subject: [Buildbot-commits] buildbot/buildbot/process base.py,1.43,1.44 builder.py,1.18,1.19 step.py,1.56,1.57
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot/process
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/process
Modified Files:
base.py builder.py step.py
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.
Index: base.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/base.py,v
retrieving revision 1.43
retrieving revision 1.44
diff -u -d -r1.43 -r1.44
--- base.py 30 Sep 2004 07:15:35 -0000 1.43
+++ base.py 3 Dec 2004 22:54:51 -0000 1.44
@@ -5,7 +5,7 @@
from twisted.python import log, components
from twisted.python.failure import Failure
-from twisted.internet import reactor, defer
+from twisted.internet import reactor, defer, error
from twisted.spread import pb
import twisted.web.util
@@ -221,8 +221,10 @@
first Step. It returns a Deferred which will fire when the build
finishes."""
+ log.msg("%s.startBuild" % self)
self.build_status = build_status
self.remote = remote
+ self.remote.notifyOnDisconnect(self.lostRemote)
self.deferred = defer.Deferred()
try:
@@ -354,6 +356,8 @@
self.results.append(result)
if text:
self.text.extend(text)
+ if not self.remote:
+ terminate = True
if result == FAILURE:
if step.warnOnFailure:
if self.result != FAILURE:
@@ -371,22 +375,38 @@
self.result = FAILURE
return terminate
+ def lostRemote(self, remote):
+ # the slave went away. There are several possible reasons for this,
+ # and they aren't necessarily fatal. For now, kill the build, but
+ # TODO: see if we can resume the build when it reconnects.
+ log.msg("%s.lostRemote" % self)
+ self.remote = None
+ if self.currentStep:
+ # this should cause the step to finish.
+ log.msg(" stopping currentStep", self.currentStep)
+ self.currentStep.interrupt(Failure(error.ConnectionLost()))
+
def stopBuild(self, reason):
- # the idea here is to let the user cancel a build because, e.g., they
- # realized they committed a bug and they don't want to waste the time
- # building something that they know will fail. Another reason might
- # be to abandon a stuck build. We want to mark the build as failed
- # quickly rather than waiting for it to die on its own.
+ # the idea here is to let the user cancel a build because, e.g.,
+ # they realized they committed a bug and they don't want to waste
+ # the time building something that they know will fail. Another
+ # reason might be to abandon a stuck build. We want to mark the
+ # build as failed quickly rather than waiting for the slave's
+ # timeout to kill it on its own.
log.msg(" %s: stopping build: %s" % (self, reason))
- assert not self.finished
- #self.currentStep.stop(reason)
- # TODO: maybe let its deferred do buildFinished
- if self.currentStep and self.currentStep.progress:
- # XXX: really .fail or something
- self.currentStep.progress.finish()
- text = ["stopped", reason]
- self.buildFinished(text, "red", FAILURE)
+ if self.finished:
+ return
+ # TODO: include 'reason' in this point event
+ self.builder.builder_status.addPointEvent(['interrupt'])
+ self.currentStep.interrupt(reason)
+ if 0:
+ # TODO: maybe let its deferred do buildFinished
+ if self.currentStep and self.currentStep.progress:
+ # XXX: really .fail or something
+ self.currentStep.progress.finish()
+ text = ["stopped", reason]
+ self.buildFinished(text, "red", FAILURE)
def allStepsDone(self):
if self.result == FAILURE:
@@ -419,6 +439,8 @@
abandoned."""
self.finished = True
+ if self.remote:
+ self.remote.dontNotifyOnDisconnect(self.lostRemote)
self.results = results
log.msg(" %s: build finished" % self)
@@ -439,5 +461,9 @@
class BuildControl(components.Adapter):
__implements__ = interfaces.IBuildControl,
+
def getStatus(self):
return self.original.build_status
+
+ def stopBuild(self, reason=""):
+ self.original.stopBuild(reason)
Index: builder.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/builder.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -d -r1.18 -r1.19
--- builder.py 7 Nov 2004 20:32:47 -0000 1.18
+++ builder.py 3 Dec 2004 22:54:51 -0000 1.19
@@ -1,8 +1,8 @@
#! /usr/bin/python
-from twisted.python import log, components
+from twisted.python import log, components, failure
from twisted.spread import pb
-from twisted.internet import reactor
+from twisted.internet import reactor, defer
from buildbot import interfaces
from buildbot.status.progress import Expectations
@@ -172,6 +172,7 @@
def detached(self):
"""This is called when the connection to the bot is lost."""
+ log.msg("%s.detached" % self)
self.remote = None
reactor.callLater(0, self._detached)
# the current step will be stopped (via a notifyOnDisconnect
@@ -179,6 +180,7 @@
def _detached(self):
if self.currentBuild:
+ log.msg("%s._detached: killing build" % self)
# wasn't enough
self.currentBuild.stopBuild("slave lost")
self.currentBuild = None
@@ -298,7 +300,8 @@
def startBuild(self, build):
log.msg("starting build %s" % build)
- self.remote.callRemote("startBuild") # informational courtesy
+ d = self.remote.callRemote("startBuild") # informational courtesy
+ d.addErrback(self._startBuildFailed, build)
# create the BuildStatus object that goes with the Build
bs = self.builder_status.newBuild()
@@ -309,9 +312,14 @@
# Finally it will start the actual build process.
d = build.startBuild(bs, self.expectations, self.remote)
d.addCallback(self.buildFinished)
+ d.addErrback(log.err)
control = base.BuildControl(build)
return control
+ def _startBuildFailed(self, why, build):
+ log.msg("wanted to start build %s, but "
+ "remote_startBuild failed: %s" % (build, why))
+
def testsFinished(self, results):
# XXX: add build number, datestamp, Change information
#self.testTracker.testsFinished(results)
@@ -365,6 +373,50 @@
self.remote.callRemote("shutdown")
+class Ping:
+ def ping(self, status, remote, timeout):
+ if not remote:
+ status.addPointEvent(["ping", "no slave"], "red")
+ return defer.succeed(False) # interfaces.NoSlaveError
+ self.event = status.addEvent(["pinging"], "yellow")
+ self.active = True
+ self.d = defer.Deferred()
+ d = remote.callRemote("print", "ping")
+ d.addBoth(self._pong)
+
+ # We use either our own timeout or the (long) TCP timeout to detect
+ # silently-missing slaves. This might happen because of a NAT
+ # timeout or a routing loop. If the slave just shuts down (and we
+ # somehow missed the FIN), we should get a "connection refused"
+ # message.
+ self.timer = reactor.callLater(timeout, self.timeout)
+ return self.d
+
+ def timeout(self):
+ self.timer = None
+ self._pong(failure.Failure(interfaces.NoSlaveError("timeout")))
+
+ def _pong(self, res):
+ if not self.active:
+ return
+ self.active = False
+ if self.timer:
+ self.timer.cancel()
+ e = self.event
+ if isinstance(res, failure.Failure):
+ e.text = ["ping", "failed"]
+ e.color = "red"
+ ponged = False
+ # TODO: force the BotPerspective to disconnect, since this
+ # indicates that the bot is unreachable. That will also append a
+ # "disconnect" event to the builder_status, terminating this
+ # "ping failed" event.
+ else:
+ e.text = ["ping", "success"]
+ e.color = "green"
+ ponged = True
+ e.finish()
+ self.d.callback(ponged)
class BuilderControl(components.Adapter):
__implements__ = interfaces.IBuilderControl,
@@ -372,39 +424,15 @@
bc = self.original.forceBuild(who, reason)
return bc
- def ping(self, wait=False):
- status = self.original.builder_status
- if not self.original.remote:
- status.addPointEvent(["ping", "no slave"], "red")
- if wait:
- return defer.fail(interfaces.NoSlaveError())
- else:
- # we rely upon the TCP timeout to detect silently-missing
- # slaves. This might happen because of a NAT timeout or a
- # routing loop. If the slave just shuts down, we should get a
- # "connection refused" message. Of course, in that case we
- # should have gotten one for the connection anyway, but
- # sometimes things get lost.
- e = status.addEvent(["pinging"], "yellow")
- d = self.original.remote.callRemote("print", "ping")
- d.addCallback(self._pong, e)
- d.addErrback(self._pong_failed, e, wait)
- if wait:
- return d
-
- def _pong(self, res, e):
- e.text = ["ping", "success"]
- e.color = "green"
- e.finish()
+ def getBuild(self, number):
+ b = self.original.currentBuild
+ if b and b.build_status.number == number:
+ return base.BuildControl(b)
+ return None
- def _pong_failed(self, why, e, wait):
- e.text = ["ping", "failed"]
- e.color = "red"
- e.finish()
- # TODO: force the BotPerspective to disconnect, since this indicates
- # that the bot is unreachable. That will also append a "disconnect"
- # event to the builder_status, terminating this "ping failed" event.
- if wait:
- raise interfaces.NoSlaveError()
+ def ping(self, timeout=30):
+ d = Ping().ping(self.original.builder_status,
+ self.original.remote, timeout)
+ return d
components.registerAdapter(BuilderControl, Builder, interfaces.IBuilderControl)
Index: step.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/process/step.py,v
retrieving revision 1.56
retrieving revision 1.57
diff -u -d -r1.56 -r1.57
--- step.py 28 Oct 2004 07:25:30 -0000 1.56
+++ step.py 3 Dec 2004 22:54:51 -0000 1.57
@@ -27,8 +27,7 @@
SlaveCommands registered in the buildslave, and self.args to a dictionary
of arguments that will be passed to the SlaveCommand instance.
- start, remoteUpdate, remoteComplete, and remoteFailed are available to be
- overridden
+ start, remoteUpdate, and remoteComplete are available to be overridden
"""
@@ -41,6 +40,14 @@
self.remote_command = remote_command
self.args = args
+ def __getstate__(self):
+ dict = self.__dict__.copy()
+ # Remove the remote ref: if necessary (only for resumed builds), it
+ # will be reattached at resume time
+ if dict.has_key("remote"):
+ del dict["remote"]
+ return dict
+
def run(self, step, remote):
self.active = True
self.step = step
@@ -51,40 +58,40 @@
self.commandID = "%d" % c
log.msg("%s: RemoteCommand.run [%s]" % (self, self.commandID))
self.deferred = defer.Deferred()
+
d = defer.maybeDeferred(self.start)
- d.addErrback(self._remoteFailed) # will catch unknown commands
- return self.deferred
- def __getstate__(self):
- dict = self.__dict__.copy()
- # Remove the remote ref: if necessary (only for resumed builds), it
- # will be reattached at resume time
- if dict.has_key("remote"):
- del dict["remote"]
- return dict
+ # _finished is called with an error for unknown commands, errors
+ # that occur while the command is starting (including OSErrors in
+ # exec()), StaleBroker (when the connection was lost before we
+ # started), and pb.PBConnectionLost (when the slave isn't responding
+ # over this connection, perhaps it had a power failure, or NAT
+ # weirdness). If this happens, self.deferred is fired right away.
+ d.addErrback(self._finished)
+
+ # Connections which are lost while the command is running are caught
+ # when our parent Step calls our .lostRemote() method.
+ return self.deferred
def start(self):
# We will receive remote_update messages as the command runs.
# We will get a single remote_complete when it finishes.
# We should fire self.deferred when the command is done.
- self.remote.notifyOnDisconnect(self.disconnect)
d = self.remote.callRemote("startCommand", self, self.commandID,
self.remote_command, self.args)
return d
- def disconnect(self, broker):
- # lost the slave: fail the command
- log.msg("RemoteCommand.disconnect: lost slave", self)
- self.active = False
- self._remoteFailed(Failure(error.ConnectionLost()))
+ def interrupt(self, why):
+ if isinstance(why, Failure) and why.check(error.ConnectionLost):
+ log.msg("RemoteCommand.disconnect: lost slave", self)
+ self.remote = None
+ self._finished(Failure(error.ConnectionLost()))
+ return
- def todo_stop(self):
# tell the remote command to halt. Returns a Deferred that will fire
- # when the command has been stopped, or will errback if the slave is
- # unreachable.
+ # when the interrupt command has been delivered.
d = defer.maybeDeferred(self.remote.callRemote,
- "stopCommand", self, self.commandID,
- reason)
+ "interruptCommand", self.commandID, why)
return d
def remote_update(self, updates):
@@ -96,7 +103,7 @@
self.remoteUpdate(update)
except:
# log failure, terminate build, let slave retire the update
- self.failed(Failure())
+ self._finished(Failure())
# TODO: what if multiple updates arrive? should
# skip the rest but ack them all
if num > max_updatenum:
@@ -109,51 +116,45 @@
def remote_complete(self, failure=None):
# call the real remoteComplete a moment later, but first return an
# acknowledgement so the slave can retire the completion message.
- self.remote.dontNotifyOnDisconnect(self.disconnect)
if self.active:
- reactor.callLater(0, self._remoteComplete, failure)
+ reactor.callLater(0, self._finished, failure)
return None
- def _remoteComplete(self, failure):
- if failure:
- return self._remoteFailed(failure)
- try:
- self.remoteComplete()
- except:
- # log the failure and terminate the step
- log.msg("remoteComplete had exception")
- return self.failed(Failure())
- self.finished()
-
- def _remoteFailed(self, failure):
- log.msg("RemoteCommand._remoteFailed")
- try:
- self.remote.dontNotifyOnDisconnect(self.disconnect)
- except ValueError:
- # TODO: make this cleaner but keep it safe
- pass # probably already removed it in remote_complete.
- try:
- self.remoteFailed(failure)
- except:
- log.msg("RemoteCommand.remoteFailed failed")
- log.err()
- return self.failed(failure)
+ def _finished(self, failure=None):
+ self.active = False
+ # call .remoteComplete. If it raises an exception, or returns the
+ # Failure that we gave it, our self.deferred will be errbacked. If
+ # it does not (either it ate the Failure or the step finished
+ # normally and it didn't raise a new exception), self.deferred will
+ # be callbacked.
+ d = defer.maybeDeferred(self.remoteComplete, failure)
+ # arrange for the callback to get this RemoteCommand instance
+ # instead of just None
+ d.addCallback(lambda r: self)
+ d.addBoth(self.deferred.callback)
- def remoteComplete(self):
- # subclasses should interpret status as they like and do cleanup
- pass
+ def remoteComplete(self, maybeFailure):
+ """Subclasses can override this.
- def remoteFailed(self, why):
- # subclasses should do any cleanup (like closing log files) here
- pass
+ This is called when the RemoteCommand has finished. 'maybeFailure'
+ will be None if the command completed normally, or a Failure
+ instance in one of the following situations:
- def finished(self):
- self.active = False
- self.deferred.callback(self)
+ # the slave was lost before the command was started
+ # the slave didn't respond to the startCommand message
+ # the slave raised an exception while starting the command
+ # (bad command name, bad args, OSError from missing executable)
+ # the slave raised an exception while finishing the command
+ # (they send back a remote_complete message with a Failure payload)
+ # and also (for now):
+ # slave disconnected while the command was running
+
+ This method should do cleanup, like closing log files. It should
+ normally return the 'maybeFailure' argument, so that any exceptions will
+ be propagated to the Step. If it wants to consume them, return None
+ instead."""
- def failed(self, why):
- self.active = False
- self.deferred.errback(why)
+ return failure
class LoggedRemoteCommand(RemoteCommand):
"""This is a RemoteCommand which expects the slave to send back
@@ -203,15 +204,14 @@
log.msg("%s rc=%s" % (self, rc))
self.addHeader("program finished with exit code %d\n" % rc)
- def remoteComplete(self):
- if self.closeWhenFinished:
- log.msg("closing log")
- self.log.finish()
-
- def remoteFailed(self, why):
+ def remoteComplete(self, maybeFailure):
if self.closeWhenFinished:
- self.addHeader("\nremoteFailed: %s" % why)
+ if maybeFailure:
+ self.addHeader("\nremoteFailed: %s" % maybeFailure)
+ else:
+ log.msg("closing log")
self.log.finish()
+ return maybeFailure
class RemoteShellCommand(LoggedRemoteCommand):
"""This class helps you run a shell command on the build slave. It will
@@ -426,6 +426,15 @@
raise NotImplementedError("your subclass must implement this method")
+ def interrupt(self, reason):
+ """Halt the command, either because the user has decided to cancel
+ the build ('reason' is a string), or because the slave has
+ disconnected ('reason' is a ConnectionLost Failure). Any further
+ local processing should be skipped, and the Step completed with an
+ error status. The results text should say something useful like
+ ['step', 'interrupted'] or ['remote', 'lost']"""
+ pass
+
def finished(self, results):
if self.progress:
self.progress.finish()
@@ -563,9 +572,24 @@
self.cmd.useLog(loog, True)
loog.logProgressTo(self.progress, "output")
d = self.runCommand(self.cmd)
- d.addCallback(self._commandComplete)
+ d.addCallbacks(self._commandComplete, self.checkDisconnect)
d.addErrback(self.failed)
+ def interrupt(self, reason):
+ # TODO: consider adding an INTERRUPTED or STOPPED status to use
+ # instead of FAILURE, might make the text a bit more clear
+ self.addCompleteLog('interrupt', reason)
+ d = self.cmd.interrupt(reason)
+ return d
+
+ def checkDisconnect(self, f):
+ f.trap(error.ConnectionLost)
+ self.step_status.setColor("red")
+ self.step_status.setText(self.describe(True) +
+ ["failed", "slave", "lost"])
+ self.step_status.setText2(["failed", "slave", "lost"])
+ return self.finished(FAILURE)
+
def _commandComplete(self, cmd):
self.commandComplete(cmd)
self.createSummary(cmd.log)
@@ -1106,33 +1130,44 @@
@param timeout: the number of seconds to delay
"""
+ haltOnFailure = True
name = "dummy"
def __init__(self, timeout=5, **kwargs):
BuildStep.__init__(self, **kwargs)
self.timeout = timeout
+ self.timer = None
+
def start(self):
self.step_status.setColor("yellow")
self.step_status.setText(["delay", "%s secs" % self.timeout])
- reactor.callLater(self.timeout, self._done)
- def _done(self):
+ self.timer = reactor.callLater(self.timeout, self.done)
+
+ def interrupt(self, reason):
+ if self.timer:
+ self.timer.cancel()
+ self.timer = None
+ self.step_status.setColor("red")
+ self.step_status.setText(["delay", "interrupted"])
+ self.finished(FAILURE)
+
+ def done(self):
self.step_status.setColor("green")
self.finished(SUCCESS)
-class FailingDummy(BuildStep):
+class FailingDummy(Dummy):
"""I am a dummy step that raises an Exception after 5 seconds
@param timeout: the number of seconds to delay
"""
name = "failing dummy"
- def __init__(self, timeout=5, **kwargs):
- BuildStep.__init__(self, **kwargs)
- self.timeout = timeout
+
def start(self):
self.step_status.setColor("yellow")
self.step_status.setText(["boom", "%s secs" % self.timeout])
- reactor.callLater(self.timeout, self.boom)
- def boom(self):
+ self.timer = reactor.callLater(self.timeout, self.done)
+
+ def done(self):
class Boom(Exception):
pass
try:
@@ -1150,6 +1185,7 @@
"""
name = "remote dummy"
+
def __init__(self, timeout=5, **kwargs):
BuildStep.__init__(self, **kwargs)
args = {'timeout': timeout}
From warner at users.sourceforge.net Fri Dec 3 22:54:55 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Fri, 03 Dec 2004 22:54:55 +0000
Subject: [Buildbot-commits] buildbot/buildbot/status builder.py,1.45,1.46 html.py,1.46,1.47
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot/status
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/status
Modified Files:
builder.py html.py
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.
Index: builder.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/status/builder.py,v
retrieving revision 1.45
retrieving revision 1.46
diff -u -d -r1.45 -r1.46
--- builder.py 24 Nov 2004 02:41:22 -0000 1.45
+++ builder.py 3 Dec 2004 22:54:52 -0000 1.46
@@ -340,8 +340,11 @@
return (self.finished is not None)
def waitUntilFinished(self):
- d = defer.Deferred()
- self.finishedWatchers.append(d)
+ if self.finished:
+ d = defer.succeed(self)
+ else:
+ d = defer.Deferred()
+ self.finishedWatchers.append(d)
return d
# while the step is running, the following methods make sense.
@@ -580,8 +583,11 @@
return (self.finished is not None)
def waitUntilFinished(self):
- d = defer.Deferred()
- self.finishedWatchers.append(d)
+ if self.finished:
+ d = defer.succeed(self)
+ else:
+ d = defer.Deferred()
+ self.finishedWatchers.append(d)
return d
# while the build is running, the following methods make sense.
@@ -1365,7 +1371,7 @@
return self.botmaster.builders[name].builder_status
def getSlave(self, slavename):
- return self.botmaster.slaveStatus[slavename]
+ return self.botmaster.slaves[slavename].slave_status
def subscribe(self, target):
self.watchers.append(target)
Index: html.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/status/html.py,v
retrieving revision 1.46
retrieving revision 1.47
diff -u -d -r1.46 -r1.47
--- html.py 24 Nov 2004 03:35:05 -0000 1.46
+++ html.py 3 Dec 2004 22:54:52 -0000 1.47
@@ -5,10 +5,11 @@
from twisted.python import log, components
import urllib
+from twisted.internet import defer, reactor
from twisted.web.resource import Resource
from twisted.web import static, html, server, distrib
from twisted.web.error import NoResource
-from twisted.web.util import Redirect
+from twisted.web.util import Redirect, DeferredResource
from twisted.application import service, internet
from twisted.spread import pb
@@ -35,6 +36,23 @@
class IHTMLLog(components.Interface):
pass
+ROW_TEMPLATE = '''
+
+ %(label)s
+ %(field)s
+
'''
+
+def make_row(label, field):
+ """Create a name/value row for the HTML.
+
+ `label` is plain text; it will be HTML-encoded.
+
+ `field` is a bit of HTML structure; it will not be encoded in
+ any way.
+ """
+ label = html.escape(label)
+ return ROW_TEMPLATE % {"label": label, "field": field}
+
colormap = {
'green': '#72ff75',
}
@@ -227,9 +245,10 @@
class StatusResourceBuild(HtmlResource):
title = "Build"
- def __init__(self, build):
+ def __init__(self, build, control):
HtmlResource.__init__(self)
self.build = build
+ self.control = control
def body(self, request):
b = self.build
@@ -246,6 +265,19 @@
data += "
"
+ if self.control is not None:
+ stopURL = urllib.quote(request.childLink("stop"))
+ data += """
+
+ """
data += ("
Blamelist:
\n"
" \n")
@@ -262,10 +294,31 @@
#data += html.PRE(b.changesText()) # TODO
return data
+ def stop(self, request):
+ log.msg("web stopBuild of build %s:%s" % \
+ (self.build.getBuilder().getName(),
+ self.build.getNumber()))
+ name = request.args.get("username", [""])[0]
+ comments = request.args.get("comments", [""])[0]
+ reason = ("The web-page 'stop build' button was pressed by "
+ "'%s': %s\n" % (name, comments))
+ self.control.stopBuild(reason)
+ # we're at http://localhost:8080/svn-hello/builds/5/stop?[args] and
+ # we want to go to: http://localhost:8080/svn-hello/builds/5 or
+ # http://localhost:8080/
+ #
+ #return Redirect("../%d" % self.build.getNumber())
+ r = Redirect("../../..")
+ d = defer.Deferred()
+ reactor.callLater(1, d.callback, r)
+ return DeferredResource(d)
+
def getChild(self, path, request):
if path == "tests":
# TODO: this will collide with a step named 'tests'
return StatusResourceTestResults(self.build.getTestResults())
+ if path == "stop":
+ return self.stop(request)
stepname = path
steps = self.build.getSteps()
for s in steps:
@@ -285,17 +338,14 @@
def body(self, request):
b = self.builder
slave = b.getSlave()
- data = self.make_row("Builder:",
- html.escape(b.getName()))
+ data = make_row("Builder:", html.escape(b.getName()))
b1 = b.getBuild(-1)
if b1 is not None:
- data += self.make_row("Current/last build:",
- str(b1.getNumber()))
+ data += make_row("Current/last build:", str(b1.getNumber()))
if slave.isConnected():
data += "\nCONNECTED (slave '%s') \n" % slave.getName()
if slave.getAdmin():
- data += self.make_row("Admin:",
- html.escape(slave.getAdmin()))
+ data += make_row("Admin:", html.escape(slave.getAdmin()))
if slave.getHost():
data += "Host info:\n"
data += html.PRE(slave.getHost())
@@ -309,10 +359,10 @@
@@ -334,23 +384,6 @@
return data
- def make_row(self, label, field):
- """Create a name/value row for the HTML.
-
- `label` is plain text; it will be HTML-encoded.
-
- `field` is a bit of HTML structure; it will not be encoded in
- any way.
- """
- label = html.escape(label)
- return self.ROW_TEMPLATE % {"label": label, "field": field}
-
- ROW_TEMPLATE = '''
-
- %(label)s
- %(field)s
-
'''
-
def force(self, request):
name = request.args.get("username", [""])[0]
reason = request.args.get("comments", [""])[0]
@@ -378,6 +411,8 @@
return Redirect("..")
def getChild(self, path, request):
+ log.msg('path=%s, postpath=%s, prepath=%s' % (path, request.postpath,
+ request.prepath))
if path == "force":
return self.force(request)
if path == "ping":
@@ -407,7 +442,10 @@
if path == "builds":
build = self.builder.getBuild(num)
if build:
- return StatusResourceBuild(build)
+ control = None
+ if self.control:
+ control = self.control.getBuild(num)
+ return StatusResourceBuild(build, control)
else:
return NoResource("No such build '%d'" % num)
return NoResource("really weird URL %s" % path)
From warner at users.sourceforge.net Fri Dec 3 22:54:55 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Fri, 03 Dec 2004 22:54:55 +0000
Subject: [Buildbot-commits] buildbot/buildbot/test test_run.py,1.18,1.19
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot/test
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/test
Modified Files:
test_run.py
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.
Index: test_run.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/test/test_run.py,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -d -r1.18 -r1.19
--- test_run.py 30 Sep 2004 07:13:32 -0000 1.18
+++ test_run.py 3 Dec 2004 22:54:53 -0000 1.19
@@ -4,7 +4,7 @@
dr = unittest.deferredResult
from twisted.internet import reactor, defer
from twisted.python import log
-import sys, os, shutil
+import sys, os, shutil, time
#log.startLogging(sys.stderr)
from buildbot import master, interfaces
@@ -44,6 +44,12 @@
BuildmasterConfig = c
"""
+class MyBot(bot.Bot):
+ def remote_getSlaveInfo(self):
+ return self.parent.info
+class MyBuildSlave(bot.BuildSlave):
+ botClass = MyBot
+
class STarget:
__implements__ = interfaces.IStatusReceiver,
debug = False
@@ -119,33 +125,97 @@
# now kill the timer
b1.waiting.stopTimer()
-class Status(unittest.TestCase):
+class RunMixin:
master = None
slave = None
+ slave2 = None
def setUp(self):
shutil.rmtree("basedir", ignore_errors=1)
+ shutil.rmtree("slavebase", ignore_errors=1)
+ shutil.rmtree("slavebase2", ignore_errors=1)
os.mkdir("basedir")
self.master = master.BuildMaster("basedir")
def connectSlave(self):
port = self.master.slavePort._port.getHost().port
os.mkdir("slavebase")
- slave = bot.BuildSlave("localhost", port, "bot1", "sekrit",
- "slavebase", keepalive=0, usePTY=1)
+ slave = MyBuildSlave("localhost", port, "bot1", "sekrit",
+ "slavebase", keepalive=0, usePTY=1)
+ slave.info = {"admin": "one"}
self.slave = slave
slave.startService()
d = self.master.botmaster.waitUntilBuilderAttached("dummy")
dr(d)
+ def connectSlave2(self):
+ port = self.master.slavePort._port.getHost().port
+ os.mkdir("slavebase2")
+ slave = MyBuildSlave("localhost", port, "bot1", "sekrit",
+ "slavebase2", keepalive=0, usePTY=1)
+ slave.info = {"admin": "two"}
+ self.slave2 = slave
+ slave.startService()
+
def tearDown(self):
+ log.msg("doing tearDown")
+ self.shutdownSlave()
+ if self.master:
+ dr(defer.maybeDeferred(self.master.stopService))
+ self.master = None
+
+ # various forms of slave death
+
+ def shutdownSlave(self, waitForMasterToo=True):
+ # the slave has disconnected normally: they SIGINT'ed it, or it shut
+ # down willingly. This will kill child processes and give them a
+ # chance to finish up.
if self.slave:
- d = self.master.botmaster.waitUntilBuilderDetached("dummy")
+ d = self.slave.waitUntilDisconnected()
dr(defer.maybeDeferred(self.slave.stopService))
dr(d)
- if self.master:
- dr(defer.maybeDeferred(self.master.stopService))
-
+ self.slave = None
+ if self.slave2:
+ d = self.slave2.waitUntilDisconnected()
+ dr(defer.maybeDeferred(self.slave2.stopService))
+ dr(d)
+ self.slave2 = None
+ if waitForMasterToo:
+ d = self.master.botmaster.waitUntilBuilderDetached("dummy")
+ dr(d)
+
+ def killSlave(self):
+ # the slave has died, its host sent a FIN. The .notifyOnDisconnect
+ # callbacks will terminate the current step, so the build should be
+ # flunked (no further steps should be started).
+ self.slave.bf.continueTrying = 0
+ bot = self.slave.getServiceNamed("bot")
+ broker = bot.builders["dummy"].remote.broker
+ broker.transport.loseConnection()
+ self.slave = None
+
+ def disappearSlave(self):
+ # the slave's host has vanished off the net, leaving the connection
+ # dangling. This will be detected quickly by app-level keepalives or
+ # a ping, or slowly by TCP timeouts.
+
+ # implement this by replacing the slave Broker's .dataReceived method
+ # with one that just throws away all data.
+ def discard(data):
+ pass
+ bot = self.slave.getServiceNamed("bot")
+ broker = bot.builders["dummy"].remote.broker
+ broker.dataReceived = discard # seal its ears
+ broker.transport.write = discard # and take away its voice
+
+ def ghostSlave(self):
+ # the slave thinks it has lost the connection, and initiated a
+ # reconnect. The master doesn't yet realize it has lost the previous
+ # connection, and sees two connections at once.
+ raise NotImplementedError
+
+class Status(RunMixin, unittest.TestCase):
+
def testSlave(self):
m = self.master
s = m.getStatus()
@@ -269,3 +339,166 @@
res = dr(d)
self.failUnless(3.0 < t4.eta_build < 5.0) # should be 4 seconds
+
+class Disconnect(RunMixin, unittest.TestCase):
+
+ def disconnectSetup(self):
+ # verify that disconnecting the slave during a build properly
+ # terminates the build
+ m = self.master
+ s = m.getStatus()
+ c = interfaces.IControl(m)
+
+ m.loadConfig(config_2)
+ m.readConfig = True
+ m.startService()
+
+ self.failUnlessEqual(s.getBuilderNames(), ["dummy"])
+ s1 = s.getBuilder("dummy")
+ self.failUnlessEqual(s1.getName(), "dummy")
+ self.failUnlessEqual(s1.getState(), ("offline", None, None))
+ self.failUnlessEqual(s1.getCurrentBuild(), None)
+ self.failUnlessEqual(s1.getLastFinishedBuild(), None)
+ self.failUnlessEqual(s1.getBuild(-1), None)
+
+ self.connectSlave()
+ self.failUnlessEqual(s1.getState(), ("idle", None, None))
+ return m,s,c,s1
+
+ def verifyDisconnect(self, bs):
+ self.failUnless(bs.isFinished())
+
+ step1 = bs.getSteps()[0]
+ self.failUnlessEqual(step1.getText(), ["delay", "interrupted"])
+ self.failUnlessEqual(step1.getResults()[0], builder.FAILURE)
+
+ self.failUnlessEqual(bs.getResults(), builder.FAILURE)
+
+
+ def testIdle1(self):
+ m,s,c,s1 = self.disconnectSetup()
+ # disconnect the slave before the build starts
+ self.shutdownSlave() # dies before it gets started
+
+ # trying to force a build now will cause an error. Regular builds
+ # just wait for the slave to re-appear, but forced builds that
+ # cannot be run right away trigger NoSlaveErrors
+ fb = c.getBuilder("dummy").forceBuild
+ self.failUnlessRaises(interfaces.NoSlaveError,
+ fb, None, "forced build")
+
+ def testIdle2(self):
+ m,s,c,s1 = self.disconnectSetup()
+ # now suppose the slave goes missing
+ self.disappearSlave()
+
+ # forcing a build will work: the build will begin, since we think we
+ # have a slave. The build will fail, however, because of a timeout
+ # error.
+ bc = c.getBuilder("dummy").forceBuild(None, "forced build")
+ bs = bc.getStatus()
+ print "build started"
+ d = bs.waitUntilFinished()
+ dr(d, 5)
+ print bs.getText()
+ testIdle2.skip = "short timeout not yet implemented"
+
+ def testBuild1(self):
+ m,s,c,s1 = self.disconnectSetup()
+ # this next sequence is timing-dependent. The dummy build takes at
+ # least 3 seconds to complete, and this batch of commands must
+ # complete within that time.
+ #
+ bc = c.getBuilder("dummy").forceBuild(None, "forced build")
+ bs = bc.getStatus()
+ # kill the slave while it's running the first step
+ self.shutdownSlave() # dies before it gets started
+
+ # now examine the just-stopped build and make sure it is really
+ # stopped. This is checking for bugs in which the slave-detach gets
+ # missed or causes an exception which prevents the build from being
+ # marked as "finished due to an error".
+ d = bs.waitUntilFinished()
+ dr(d, 5)
+
+ self.failUnlessEqual(s1.getState()[0], "offline")
+ self.verifyDisconnect(bs)
+
+ def testBuild2(self):
+ m,s,c,s1 = self.disconnectSetup()
+ # this next sequence is timing-dependent
+ bc = c.getBuilder("dummy").forceBuild(None, "forced build")
+ bs = bc.getStatus()
+ # shutdown the slave while it's running the first step
+ reactor.callLater(0.5, self.shutdownSlave)
+
+ dr(bs.waitUntilFinished(), 5)
+
+ self.failUnlessEqual(s1.getState()[0], "offline")
+ self.verifyDisconnect(bs)
+
+ def testBuild3(self):
+ m,s,c,s1 = self.disconnectSetup()
+ # this next sequence is timing-dependent
+ bc = c.getBuilder("dummy").forceBuild(None, "forced build")
+ bs = bc.getStatus()
+ # kill the slave while it's running the first step
+ reactor.callLater(0.5, self.killSlave)
+
+ dr(bs.waitUntilFinished(), 5)
+
+ self.failUnlessEqual(s1.getState()[0], "offline")
+ self.verifyDisconnect(bs)
+
+ def testInterrupt(self):
+ m,s,c,s1 = self.disconnectSetup()
+ # this next sequence is timing-dependent
+ bc = c.getBuilder("dummy").forceBuild(None, "forced build")
+ bs = bc.getStatus()
+ # halt the build while it's running the first step
+ reactor.callLater(0.5, bc.stopBuild, "bang go splat")
+
+ dr(bs.waitUntilFinished(), 5)
+
+ self.verifyDisconnect(bs)
+
+ def testDisappear(self):
+ m,s,c,s1 = self.disconnectSetup()
+ bc = c.getBuilder("dummy")
+
+ # ping should succeed
+ d = bc.ping(1)
+ res = dr(d)
+ self.failUnlessEqual(res, True)
+
+ # now, before any build is run, make the slave disappear
+ self.slave.bf.continueTrying = 0
+ self.disappearSlave()
+
+ # at this point, a ping to the slave should timeout
+ d = bc.ping(1)
+ res = dr(d)
+ self.failUnlessEqual(res, False)
+
+ def testDuplicate(self):
+ m,s,c,s1 = self.disconnectSetup()
+ bc = c.getBuilder("dummy")
+ bs = s.getBuilder("dummy")
+ ss = bs.getSlave()
+
+ self.failUnless(ss.isConnected())
+ self.failUnlessEqual(ss.getAdmin(), "one")
+
+ # now, before any build is run, make the first slave disappear
+ self.slave.bf.continueTrying = 0
+ self.disappearSlave()
+
+ d = self.master.botmaster.waitUntilBuilderDetached("dummy")
+ # now let the new slave take over
+ self.connectSlave2()
+ dr(d, 2)
+ d = self.master.botmaster.waitUntilBuilderAttached("dummy")
+ dr(d, 2)
+
+ self.failUnless(ss.isConnected())
+ self.failUnlessEqual(ss.getAdmin(), "two")
From warner at users.sourceforge.net Fri Dec 3 22:54:54 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Fri, 03 Dec 2004 22:54:54 +0000
Subject: [Buildbot-commits] buildbot/buildbot/slave interfaces.py,NONE,1.1 bot.py,1.3,1.4 commands.py,1.16,1.17
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot/slave
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot/slave
Modified Files:
bot.py commands.py
Added Files:
interfaces.py
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.
Index: bot.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/bot.py,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- bot.py 30 Aug 2004 22:15:23 -0000 1.3
+++ bot.py 3 Dec 2004 22:54:52 -0000 1.4
@@ -35,21 +35,20 @@
def __init__(self, builder):
self.builder = builder
-class SlaveBuilder(pb.Referenceable):
+class SlaveBuilder(pb.Referenceable, service.Service):
"""This is the local representation of a single Builder: it handles a
single kind of build (like an all-warnings build). It has a name and a
home directory. The rest of its behavior is determined by the master.
"""
-
+
+ stopCommandOnShutdown = True
+
def __init__(self, parent, name, builddir, not_really):
+ #service.Service.__init__(self)
self.name = name
- self.bot = parent
self.builddir = builddir
self.not_really = not_really
- self.basedir = os.path.join(self.bot.basedir, builddir)
- if not os.path.isdir(self.basedir):
- os.mkdir(self.basedir)
self.remote = None
# remote is a ref to the Builder object on the master side, and is
# set when they attach. It really isn't used very much
@@ -74,6 +73,20 @@
def __repr__(self):
return "" % self.name
+
+ def setServiceParent(self, parent):
+ service.Service.setServiceParent(self, parent)
+ self.bot = self.parent
+ self.basedir = os.path.join(self.bot.basedir, self.builddir)
+ if not os.path.isdir(self.basedir):
+ os.mkdir(self.basedir)
+
+ def stopService(self):
+ service.Service.stopService(self)
+ if self.command and self.stopCommandOnShutdown:
+ self.stopCommand()
+ self.command = None
+
def remote_setMaster(self, remote):
self.remote = remote
self.remote.notifyOnDisconnect(self.lostRemote)
@@ -82,14 +95,13 @@
def lostRemote(self, remote):
log.msg("lost remote")
- if self.remote != remote:
- print "WEIRD: lost the wrong remote"
self.remote = None
+
def lostRemoteStep(self, remotestep):
log.msg("lost remote step")
- if self.remoteStep != remotestep:
- print "WEIRD: lost the wrong remote step"
self.remoteStep = None
+ if self.command and self.stopCommandOnShutdown:
+ self.stopCommand()
# the following are Commands that can be invoked by the master-side
# Builder
@@ -105,7 +117,7 @@
if self.command:
log.msg("leftover command, dropping it")
- #self.stopCommand()
+ self.stopCommand()
self.command = None
try:
@@ -120,25 +132,49 @@
self.remoteStep.notifyOnDisconnect(self.lostRemoteStep)
self.updateNum = 0
self.complete = None
+ self.command.running = True
d = defer.maybeDeferred(self.command.start)
d.addCallbacks(self.commandComplete, self.commandFailed)
return None
- # the following are invoked by the Commands we spawn
+ def remote_interruptCommand(self, stepId, why):
+ """Halt the current step."""
+ log.msg("asked to interrupt current command: %s" % why)
+ if not self.command:
+ # TODO: just log it, a race could result in their interrupting a
+ # command that wasn't actually running
+ log.msg(" .. but none was running")
+ return
+ self.command.interrupt()
+
+ def stopCommand(self):
+ if not self.command:
+ return
+ self.command.running = False
+ if not self.command.interrupted:
+ self.command.interrupt()
+
+
+ # these two are fired by the Deferred attached to each Command
def commandComplete(self, dummy):
+ if not self.running:
+ return
self.sendComplete()
def commandFailed(self, why):
+ if not self.running:
+ return
log.msg("commandFailed")
log.err(why)
self.sendComplete(why)
-
- # these are utility routines used by sendStatus and commandComplete
+ # sendUpdate is invoked by the Commands we spawn
def sendUpdate(self, data=None):
"""This sends the status update to the master-side BuildStep object,
giving it a sequence number in the process. It adds the update to
a queue, and asks the master to acknowledge the update so it can be
removed from that queue."""
+ if not self.running:
+ return
self.updateNum += 1
update = [data, self.updateNum]
#log.msg("sendUpdate", update)
@@ -149,6 +185,8 @@
d.addCallback(self.ackUpdate)
d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate")
+ # these are utility routines used by sendStatus and commandComplete
+
def dummy(self, value):
pass
@@ -218,10 +256,6 @@
self.sendAllUpdates()
self.sendAllCompletes()
- def stopCommand(self):
- if self.command:
- self.command.interrupt()
- self.command = None
def finishCommand(self):
log.msg("SlaveBuilder.finishCommand", self.command)
self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep)
@@ -233,10 +267,12 @@
reactor.stop()
-class Bot(pb.Referenceable):
+class Bot(pb.Referenceable, service.MultiService):
usePTY = None
+ name = "bot"
def __init__(self, basedir, usePTY, not_really=0):
+ service.MultiService.__init__(self)
self.basedir = basedir
self.usePTY = usePTY
self.not_really = not_really
@@ -263,11 +299,13 @@
else:
b = SlaveBuilder(self, name, builddir, self.not_really)
b.usePTY = self.usePTY
+ b.setServiceParent(self)
self.builders[name] = b
retval[name] = b
for name in self.builders.keys():
if not name in map(lambda a: a[0], wanted):
log.msg("removing old builder %s" % name)
+ self.builder[name].disownServiceParent()
del(self.builders[name])
return retval
@@ -370,18 +408,33 @@
self.keepaliveTimer = None
-class BuildSlave(internet.TCPClient):
+class BuildSlave(service.MultiService):
+ botClass = Bot
+
def __init__(self, host, port, name, passwd, basedir, keepalive,
usePTY):
- bot = Bot(basedir, usePTY)
+ service.MultiService.__init__(self)
+ bot = self.botClass(basedir, usePTY)
+ bot.setServiceParent(self)
bf = self.bf = BotFactory(keepalive)
bf.startLogin(credentials.UsernamePassword(name, passwd), client=bot)
- internet.TCPClient.__init__(self, host, port, bf)
+ self.connection = c = internet.TCPClient(host, port, bf)
+ c.setServiceParent(self)
+
+ def waitUntilDisconnected(self):
+ # utility method for testing. Returns a Deferred that will fire when
+ # we lose the connection to the master.
+ if not self.bf.perspective:
+ return defer.succeed(None)
+ d = defer.Deferred()
+ self.bf.perspective.notifyOnDisconnect(lambda res: d.callback(None))
+ return d
def stopService(self):
self.bf.continueTrying = 0
- internet.TCPClient.stopService(self)
- return self._connection.disconnect()
+ service.MultiService.stopService(self)
+ # now kill the TCP connection
+ self.connection._connection.disconnect()
class Options(usage.Options):
synopsis = "Usage: mktap buildbot slave --name --passwd [options]"
Index: commands.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/commands.py,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -d -r1.16 -r1.17
--- commands.py 28 Oct 2004 07:27:08 -0000 1.16
+++ commands.py 3 Dec 2004 22:54:52 -0000 1.17
@@ -6,10 +6,14 @@
from twisted.internet import reactor, defer
from twisted.python import log, failure, runtime
+from buildbot.slave.interfaces import ISlaveCommand
from buildbot.slave.registry import registerSlaveCommand
cvs_ver = '$Revision$'[1+len("Revision: "):-2]
+# version history:
+# >=1.17: commands are interruptible
+
class CommandInterrupted(Exception):
pass
class TimeoutError(Exception):
@@ -206,11 +210,13 @@
def doTimeout(self):
msg = "command timed out: %d seconds without output" % self.timeout
+ self.kill(msg)
+
+ def kill(self, msg):
msg += ", killing pid %d" % self.process.pid
log.msg(msg)
self.sendStatus({'header': "\n" + msg + "\n"})
- # TODO: nicer way is: SIGTERM, wait, SIGKILL, wait, freak
hit = 0
if runtime.platformType == "posix":
try:
@@ -226,14 +232,15 @@
# probably no-such-process, maybe because there is no process
# group
pass
- try:
- log.msg("trying process.signalProcess('KILL')")
- self.process.signalProcess('KILL')
- log.msg(" successful")
- hit = 1
- except OSError:
- # could be no-such-process, because they finished very recently
- pass
+ if not hit:
+ try:
+ log.msg("trying process.signalProcess('KILL')")
+ self.process.signalProcess('KILL')
+ log.msg(" successful")
+ hit = 1
+ except OSError:
+ # could be no-such-process, because they finished very recently
+ pass
if not hit:
log.msg("signalProcess/os.kill failed both times")
# finished ought to be called momentarily
@@ -248,21 +255,17 @@
# be raised as pp tries to send status through .command
self.commandFailed(TimeoutError("SIGKILL failed to kill process"))
- def interrupt(self):
- log.msg("interrupting process", self.process)
- self.process.signalProcess('KILL')
- # some stdout/stderr may be lost, along with the exit code
-
class Command:
+ __implements__ = ISlaveCommand,
- """This class defines one command that can be invoked by the build
- master. The command is executed on the slave side, and always sends back
- a completion message when it finishes. It may also send intermediate
- status as it runs (by calling builder.sendStatus). Some commands can be
- interrupted (either by the build master or a local SIGINT), in which
- case the status message indicates the step failed to complete because of
- an interruption.
+ """This class defines one command that can be invoked by the build master.
+ The command is executed on the slave side, and always sends back a
+ completion message when it finishes. It may also send intermediate status
+ as it runs (by calling builder.sendStatus). Some commands can be
+ interrupted (either by the build master or a local timeout), in which
+ case the step is expected to complete normally with a status message that
+ indicates an error occurred.
These commands are used by BuildSteps on the master side. Each kind of
BuildStep uses a single Command. The slave must implement all the
@@ -274,46 +277,75 @@
where 'builder' is the parent SlaveBuilder object, and 'args' is a
dict that is interpreted per-command.
+ The setup(args) method is available for setup, and is run from __init__.
+
The Command is started with start(). This method must be implemented in a
subclass, and it should return a Deferred. When your step is done, you
should fire the Deferred (the results are not used). If the command is
- interrupted, it should errback with a CommandInterrupted failure.
+ interrupted, it should fire the Deferred anyway.
- The status messages all carry a dict, which is interpreted by the
- master-side BuildStep however it likes. The completion message only
- specifies whether the command was interrupted or not. If the Command
- needs to return an exit code of some sort, that should be sent as a
- regular status message before the completion message is sent. Once
- builder.commandComplete has been run, no more status messages may be
- sent."""
+ While the command runs, it may send status messages back to the
+ buildmaster by calling self.sendStatus(statusdict). The statusdict is
+ interpreted by the master-side BuildStep however it likes.
+
+ A separate completion message is sent when the deferred fires, which
+ indicates that the Command has finished, but does not carry any status
+ data. If the Command needs to return an exit code of some sort, that
+ should be sent as a regular status message before the deferred is fired.
+ Once builder.commandComplete has been run, no more status messages may be
+ sent.
+
+ If interrupt() is called, the Command should attempt to shut down as
+ quickly as possible. Child processes should be killed, new ones should
+ not be started. The Command should send some kind of error status update,
+ then complete as usual by firing the Deferred.
+
+ .interrupted should be set by interrupt(), and can be tested to avoid
+ sending multiple error status messages.
+
+ If .running is False, the bot is shutting down (or has otherwise lost the
+ connection to the master), and should not send any status messages. This
+ is taken care of in Command.sendStatus .
+
+ """
# builder methods:
# sendStatus(dict) (zero or more)
# commandComplete() or commandInterrupted() (one, at end)
debug = False
+ interrupted = False
+ running = False # set by Builder, cleared on shutdown or when the
+ # Deferred fires
def __init__(self, builder, stepId, args):
self.builder = builder
self.stepId = stepId # just for logging
self.args = args
+ self.setup(args)
+ def setup(self, args):
+ """Override this in a subclass to extract items from the args dict."""
+ pass
+
def start(self):
# should return a Deferred
raise NotImplementedError, "You must implement this in a subclass"
def sendStatus(self, status):
"""Send a status update to the master."""
- if self.debug: log.msg("sendStatus", status)
+ if self.debug:
+ log.msg("sendStatus", status)
+ if not self.running:
+ log.msg("would sendStatus but not .running")
+ return
self.builder.sendUpdate(status)
- def NOTinterrupt(self):
- """Stop the command. You must implement this in a subclass, then
- call this parent method when it is done. After this is called, no
- further status messages may be sent."""
- self.builder = None # make sure we stop sending messages
- self.interrupted = 1
- self.deferred.errback(failure.Failure(CommandInterrupted()))
+ def interrupt(self):
+ """Override this in a subclass to allow commands to be interrupted.
+ May be called multiple times, use self.interrupted=True if this
+ matters."""
+ pass
def _abandonOnFailure(self, rc):
if type(rc) is not int:
@@ -375,6 +407,11 @@
d = self.command.start()
return d
+ def interrupt(self):
+ self.interrupted = True
+ self.command.kill("command interrupted")
+
+
registerSlaveCommand("shell", SlaveShellCommand, cvs_ver)
@@ -382,18 +419,27 @@
def start(self):
self.d = defer.Deferred()
log.msg(" starting dummy command [%s]" % self.stepId)
- reactor.callLater(1, self.doStatus)
+ self.timer = reactor.callLater(1, self.doStatus)
return self.d
+ def interrupt(self):
+ self.timer.cancel()
+ self.timer = None
+ self.interrupted = True
+ self.finished()
+
def doStatus(self):
log.msg(" sending intermediate status")
self.sendStatus({'stdout': 'data'})
timeout = self.args.get('timeout', 5) + 1
- reactor.callLater(timeout - 1, self.finished)
+ self.timer = reactor.callLater(timeout - 1, self.finished)
def finished(self):
log.msg(" dummy command finished [%s]" % self.stepId)
- self.sendStatus({'rc': 0})
+ if self.interrupted:
+ self.sendStatus({'rc': 1})
+ else:
+ self.sendStatus({'rc': 0})
self.d.callback(0)
registerSlaveCommand("dummy", DummyCommand, cvs_ver)
@@ -426,21 +472,19 @@
"""
- def __init__(self, builder, stepId, args):
- Command.__init__(self, builder, stepId, args)
+ def setup(self, args):
self.workdir = args['workdir']
self.mode = args.get('mode', "update")
self.revision = args.get('revision')
self.patch = args.get('patch')
self.timeout = args.get('timeout', 120)
- self.setup(args)
-
- def setup(self, args):
- """Override this in the VC-specific subclass to extract more args"""
- pass
+ # VC-specific subclasses should override this to extract more args.
+ # Make sure to upcall!
def start(self):
self.sendStatus({'header': "starting " + self.header + "\n"})
+ self.command = None
+
# self.srcdir is where the VC system should put the sources
if self.mode == "copy":
self.srcdir = "source" # hardwired directory name, sorry
@@ -466,6 +510,11 @@
d.addCallbacks(self._sendRC, self._checkAbandoned)
return d
+ def interrupt(self):
+ self.interrupted = True
+ if self.command:
+ self.command.kill("command interrupted")
+
def doVC(self, res):
if self.sourcedirIsUpdateable():
d = self.doVCUpdate()
@@ -484,6 +533,8 @@
def maybeDoVCFallback(self, rc):
if type(rc) is int and rc == 0:
return rc
+ if self.interrupted:
+ raise AbandonChain(1)
msg = "update failed, clobbering and trying again"
self.sendStatus({'header': msg + "\n"})
log.msg(msg)
@@ -577,6 +628,7 @@
header = "cvs operation"
def setup(self, args):
+ SourceBase.setup(self, args)
self.cvsroot = args['cvsroot']
self.cvsmodule = args['cvsmodule']
self.global_options = args.get('global_options', [])
@@ -651,6 +703,7 @@
header = "svn operation"
def setup(self, args):
+ SourceBase.setup(self, args)
self.svnurl = args['svnurl']
def sourcedirIsUpdateable(self):
@@ -695,6 +748,7 @@
header = "darcs operation"
def setup(self, args):
+ SourceBase.setup(self, args)
self.repourl = args['repourl']
def sourcedirIsUpdateable(self):
@@ -741,6 +795,7 @@
buildconfig = None
def setup(self, args):
+ SourceBase.setup(self, args)
self.url = args['url']
self.version = args['version']
@@ -825,6 +880,7 @@
header = "p4 sync"
def setup(self, args):
+ SourceBase.setup(self, args)
self.p4port = args['p4port']
def sourcedirIsUpdateable(self):
--- NEW FILE: interfaces.py ---
#! /usr/bin/python
from twisted.python.components import Interface
class ISlaveCommand(Interface):
"""This interface is implemented by all of the buildslave's Command
subclasses. It specifies how the buildslave can start, interrupt, and
query the various Commands running on behalf of the buildmaster."""
def __init__(builder, stepId, args):
"""Create the Command. 'builder' is a reference to the parent
buildbot.slave.bot.SlaveBuilder instance, which will be used to send status
updates (by calling builder.sendUpdate). 'stepId' is a random string
which helps correlate slave logs with the master. 'args' is a dict of
arguments that comes from the master-side BuildStep, with contents
that are specific to the individual Command subclass.
This method is not intended to be overridden by subclasses."""
def setup(args):
"""This method is provided for subclasses to override, to extract
parameters from the 'args' dictionary. The default implementation does
nothing. It will be called from __init__"""
def start():
"""Begin the command, and return a Deferred.
While the command runs, it should send status updates to the
master-side BuildStep by calling self.sendStatus(status). The
'status' argument is typically a dict with keys like 'stdout',
'stderr', and 'rc'.
When the step completes, it should fire the Deferred (the results are
not used). If an exception occurs during execution, it may also
errback the deferred, however any reasonable errors should be trapped
and indicated with a non-zero 'rc' status rather than raising an
exception. Exceptions should indicate problems within the buildbot
itself, not problems in the project being tested.
"""
def interrupt():
"""This is called to tell the Command that the build is being stopped
and therefore the command should be terminated as quickly as
possible. The command may continue to send status updates, up to and
including an 'rc' end-of-command update (which should indicate an
error condition). The Command's deferred should still be fired when
the command has finally completed.
If the build is being stopped because the slave is shutting down or
because the connection to the buildmaster has been lost, the status
updates will simply be discarded. The Command does not need to be
aware of this.
Child shell processes should be killed. Simple ShellCommand classes
can just insert a header line indicating that the process will be
killed, then os.kill() the child."""
From warner at users.sourceforge.net Fri Dec 3 22:54:53 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Fri, 03 Dec 2004 22:54:53 +0000
Subject: [Buildbot-commits] buildbot/buildbot interfaces.py,1.20,1.21 master.py,1.54,1.55
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004/buildbot
Modified Files:
interfaces.py master.py
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.
Index: interfaces.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/interfaces.py,v
retrieving revision 1.20
retrieving revision 1.21
diff -u -d -r1.20 -r1.21
--- interfaces.py 15 Oct 2004 16:59:44 -0000 1.20
+++ interfaces.py 3 Dec 2004 22:54:50 -0000 1.21
@@ -603,8 +603,18 @@
further control the new build, or from which an IBuildStatus object
can be obtained."""
- def ping():
- """Attempt to contact the slave and see if it is still alive."""
+ def getBuild(number):
+ """Attempt to return an IBuildControl object for the given build.
+ Returns None if no such object is available. This will only work for
+ the build that is currently in progress: once the build finishes,
+ there is nothing to control anymore."""
+
+ def ping(timeout=30):
+ """Attempt to contact the slave and see if it is still alive. This
+ returns a Deferred which fires with either True (the slave is still
+ alive) or False (the slave did not respond). As a side effect, adds
+ an event to this builder's column in the waterfall display
+ containing the results of the ping."""
# TODO: this ought to live in ISlaveControl, maybe with disconnect()
# or something. However the event that is emitted is most useful in
# the Builder column, so it kinda fits here too.
@@ -612,3 +622,7 @@
class IBuildControl(Interface):
def getStatus():
"""Return an IBuildStatus object for the Build that I control."""
+ def stopBuild(reason=""):
+ """Halt the build. This has no effect if the build has already
+ finished."""
+
Index: master.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/master.py,v
retrieving revision 1.54
retrieving revision 1.55
diff -u -d -r1.54 -r1.55
--- master.py 14 Oct 2004 16:47:33 -0000 1.54
+++ master.py 3 Dec 2004 22:54:50 -0000 1.55
@@ -38,28 +38,30 @@
class BotPerspective(NewCredPerspective):
"""This is the master-side representative for a remote buildbot slave.
- When buildbots connect in, they get a reference to a new instance of
- this class. The BotMaster object is stashed as the .service
- attribute."""
+ There is exactly one for each slave described in the config file (the
+ c['bots'] list). When buildbots connect in (.attach), they get a
+ reference to this instance. The BotMaster object is stashed as the
+ .service attribute."""
slave_commands = None
- def __init__(self, slavename, builders, slave_status):
- self.slavename = slavename
- self.builders = builders
- self.slave_status = slave_status
+ def __init__(self, name):
+ self.slavename = name
+ self.slave_status = SlaveStatus(name)
+ self.builders = [] # list of b.p.builder.Builder instances
+ self.slave = None # a RemoteReference to the Bot, when connected
def addBuilder(self, builder):
"""Called to add a builder after the slave has connected."""
self.builders.append(builder)
- # TODO: resync with slave, to accomodate builders added after
- # attach
- self.sendBuilderList()
+ if self.slave:
+ self.sendBuilderList()
def removeBuilder(self, builder):
self.builders.remove(builder)
- builder.detached()
- self.sendBuilderList()
+ if self.slave:
+ builder.detached()
+ self.sendBuilderList()
def __repr__(self):
return "" % \
@@ -67,14 +69,78 @@
string.join(map(lambda b: b.name, self.builders), ','))
def attached(self, mind):
- # this is called shortly after the slave connects. We go through a
- # sequence of calls, gathering information, then tell our Builders
- # that they have a slave to work with.
+ # this is called when the slave connects. It returns a Deferred that
+ # fires with a suitable pb.IPerspective to give to the slave (i.e.
+ # 'self')
+
+ if self.slave:
+ # uh-oh, we've got a duplicate slave. The most likely
+ # explanation is that the slave is behind a slow link, thinks we
+ # went away, and has attempted to reconnect, so we've got two
+ # "connections" from the same slave, but the previous one is
+ # stale. Give the new one precedence.
+ log.msg("duplicate slave %s replacing old one" % self.slavename)
+
+ # just in case we've got two identically-configured slaves,
+ # report the IP addresses of both so someone can resolve the
+ # squabble
+ tport = self.slave.broker.transport
+ log.msg("old slave was connected from", tport.getPeer())
+ log.msg("new slave is from", mind.broker.transport.getPeer())
+ d = self.disconnect()
+ d.addCallback(lambda res: self._attached(mind))
+ return d
+
+ self._attached(mind)
+ return defer.succeed(self)
+
+ def disconnect(self):
+ log.msg("disconnecting old slave %s now" % self.slavename)
+
+ # all kinds of teardown will happen as a result of
+ # loseConnection(), but it happens after a reactor iteration or
+ # two. Hook the actual disconnect so we can know when it is safe
+ # to connect the new slave. We have to wait one additional
+ # iteration (with callLater(0)) to make sure the *other*
+ # notifyOnDisconnect handlers have had a chance to run.
+ d = defer.Deferred()
+
+ self.slave.notifyOnDisconnect(lambda res: # TODO: d=d ?
+ reactor.callLater(0, d.callback, None))
+ tport = self.slave.broker.transport
+ # this is the polite way to request that a socket be closed
+ tport.loseConnection()
+ try:
+ # but really we don't want to wait for the transmit queue to
+ # drain. The remote end is unlikely to ACK the data, so we'd
+ # probably have to wait for a (20-minute) TCP timeout.
+ #tport._closeSocket()
+ # however, doing _closeSocket (whether before or after
+ # loseConnection) somehow prevents the notifyOnDisconnect
+ # handlers from being run. Bummer.
+ tport.offset = 0
+ tport.dataBuffer = ""
+ pass
+ except:
+ # however, these hacks are pretty internal, so don't blow up if
+ # they fail or are unavailable
+ log.msg("failed to accelerate the shutdown process")
+ pass
+ log.msg("waiting for slave to finish disconnecting")
+
+ # When this Deferred fires, we'll be ready to accept the new slave
+ return d
+
+ def _attached(self, mind):
+ # We go through a sequence of calls, gathering information, then
+ # tell our Builders that they have a slave to work with.
self.slave = mind
self.slave.callRemote("print", "attached").addErrback(lambda why: 0)
self.slave_status.connected = True
log.msg("bot attached")
+ # TODO: there is a window here (while we're retrieving slaveinfo)
+ # during which a disconnect or a duplicate-slave will be confusing
d = self.slave.callRemote("getSlaveInfo")
d.addCallback(self.got_info)
d.addErrback(self.infoUnavailable)
@@ -149,13 +215,16 @@
for name, remote in blist.items():
for b in self.builders:
if b.name == name:
+ # if we sent the builders list because of a config
+ # change, the Builder might already be attached.
+ # Builder.attached will ignore us if this happens.
b.attached(remote, self.slave_commands)
continue
def _listFailed(self, why):
log.msg("BotPerspective._listFailed")
log.err(why)
- # TODO: hand up on them, without setBuilderList we can't use them
+ # TODO: hang up on them, without setBuilderList we can't use them
def perspective_forceBuild(self, name, who=None):
# slave admins are allowed to force any of their own builds
@@ -176,13 +245,11 @@
pass
def detached(self, mind):
+ self.slave = None
self.slave_status.connected = False
- self.botmaster.detach(self)
for b in self.builders:
b.detached()
- self.builders = []
- log.msg("bot detached")
- # this perspective goes away now
+ log.msg("Botmaster.detached(%s)" % self.slavename)
class BotMaster(service.Service):
@@ -204,8 +271,7 @@
# which is the master-side object that defines and controls a build.
# They are added by calling botmaster.addBuilder() from the startup
# code.
- self.slaves = {}
- self.slaveStatus = {}
+ self.slaves = {} # maps slavename to BotPerspective
self.interlocks = {}
self.statusClientService = None
self.watchers = {}
@@ -220,10 +286,20 @@
def waitUntilBuilderDetached(self, name):
# convenience function for testing
d = defer.Deferred()
- b = self.builders[name]
+ b = self.builders.get(name, None)
+ if not b or not b.remote:
+ return defer.succeed(None)
b.watchers['detach'].append(d)
return d
+ def addSlave(self, slavename):
+ slave = BotPerspective(slavename)
+ self.slaves[slavename] = slave
+
+ def removeSlave(self, slavename):
+ d = self.slaves[slavename].disconnect
+ del self.slaves[slavename]
+
def getBuildernames(self):
return self.builderNames
@@ -233,24 +309,26 @@
that build: the builds cannot be done until the right slave
connects."""
if self.debug: print "addBuilder", builder
+ log.msg("Botmaster.addBuilder(%s)" % builder.name)
+
if builder.name in self.builderNames:
- raise KeyError, "muliply defined builder '%s'" % builder.name
+ raise KeyError("muliply defined builder '%s'" % builder.name)
+ slavename = builder.slavename
+ if not self.slaves.has_key(slavename):
+ raise KeyError("builder %s uses undefined slave %s" % \
+ (builder.name, slavename))
+
self.builders[builder.name] = builder
self.builderNames.append(builder.name)
builder.setBotmaster(self)
self.checkInactiveInterlocks() # TODO?: do this in caller instead?
- if not self.slaveStatus.has_key(builder.slavename):
- # this is a new slave, create a SlaveStatus object for it
- s = SlaveStatus(builder.slavename)
- self.slaveStatus[builder.slavename] = s
- slave = self.slaves.get(builder.slavename)
- if slave:
- # there is an active slave which needs to be informed about the
- # new builder
- slave.addBuilder(builder)
+
+ slave = self.slaves[slavename]
+ slave.addBuilder(builder)
def removeBuilder(self, builder):
if self.debug: print "removeBuilder", builder
+ log.msg("Botmaster.removeBuilder(%s)" % builder.name)
b = self.builders[builder.name]
# any linked interlocks will be made inactive before the builder is
# removed
@@ -266,18 +344,9 @@
i.deactivate(self.builders)
del self.builders[builder.name]
self.builderNames.remove(builder.name)
- # check for an active slave to remove the builder from
- for slavename, slave in self.slaves.items():
- if slavename == builder.slavename:
- slave.removeBuilder(builder)
- # now see if this was the last builder to use the slave
- used = False
- for b in self.builders.values():
- if b.slavename == builder.slavename:
- used = True
- break
- if not used:
- del self.slaveStatus[builder.slavename]
+ slave = self.slaves.get(builder.slavename)
+ if slave:
+ slave.removeBuilder(builder)
def addInterlock(self, interlock):
"""This is called by the setup code to create build interlocks:
@@ -312,40 +381,7 @@
interlock.deactivate(self.builders)
def getPerspective(self, slavename):
- if self.slaves.has_key(slavename):
- # uh-oh, we've got a duplicate slave. Try to figure out where the
- # old one is coming from so we can explain the problem
- log.msg("duplicate slave %s trying to connect" % slavename)
- addr = self.slaves[slavename].slave.broker.transport.getPeer()
- log.msg("old slave is connected from", addr)
- # unfortunately the slave doesn't currently emit this message
- raise ValueError("duplicate slave, old one connected from %s" \
- % addr)
-
- slave_status = self.slaveStatus.get(slavename)
- if not slave_status:
- # TODO: this is probably broken w.r.t slaves connecting before
- # their builders have been configured, or vice versa
- slave_status = SlaveStatus(slavename)
- self.slaveStatus[slavename] = slave_status
- slave_status.connected = True
-
- # Find all the builders that want to use this slave
- builders = [b for (name, b) in self.builders.items()
- if b.slavename == slavename]
- p = BotPerspective(slavename, builders, slave_status)
- p.botmaster = self
- self.slaves[slavename] = p
- return p
-
- def detach(self, p):
- if not self.slaves[p.slavename] == p:
- # TODO: I saw this happen, but I don't know why
- log.msg("WEIRD, wrong slave '%s' saying goodbye" % p.slavename)
- log.msg(" original:", self.slaves[p.slavename])
- log.msg(" detaching:", p)
- self.slaveStatus[p.slavename].connected = False
- del self.slaves[p.slavename]
+ return self.slaves[slavename]
def addChange(self, change):
for b in self.builders.values():
@@ -459,6 +495,8 @@
def requestAvatar(self, avatarID, mind, interface):
assert interface == pb.IPerspective
+ log.msg("requestAvatar(%s) from %s" % \
+ (avatarID, mind.broker.transport.getPeer()))
afactory = self.names.get(avatarID)
if afactory:
p = afactory.getPerspective()
@@ -474,14 +512,14 @@
p = self.botmaster.getPerspective(avatarID)
if not p:
- raise ValueError, "no perspective for '%s'" % avatarID
- p.attached(mind) # perhaps .callLater(0) ?
- # TODO: especially for BotPerspectives
- # TODO: the slave might be removed from BotMaster.slaves by the time
- # the .detached callback is run, causing the assert in
- # BotMaster.detach to fail
- return (pb.IPerspective, p,
- lambda p=p,mind=mind: p.detached(mind))
+ raise ValueError("no perspective for '%s'" % avatarID)
+
+ d = defer.maybeDeferred(p.attached, mind)
+ d.addCallback(self._avatarAttached, mind)
+ return d
+
+ def _avatarAttached(self, p, mind):
+ return (pb.IPerspective, p, lambda p=p,mind=mind: p.detached(mind))
########################################
@@ -537,6 +575,7 @@
self.statusTargets = []
+ self.bots = []
self.sources = []
self.readConfig = False
@@ -689,6 +728,12 @@
raise TypeError, "webPortnum '%s' must be an int" % webPortnum
for s in status:
assert interfaces.IStatusReceiver(s)
+ if 0: # tuple-specified builders are a problem
+ slavenames = [name for name,pw in bots]
+ for b in builders:
+ if b['slavename'] not in slavenames:
+ raise ValueError("builder %s uses undefined slave %s" \
+ % (b['name'], b['slavename']))
# now we're committed to implementing the new configuration, so do
# it atomically
@@ -700,13 +745,7 @@
# self.bots: Disconnect any that were attached and removed from the
# list. Update self.checker with the new list of passwords,
# including debug/change/status.
- self.checker.users = {} # violates abstraction, oh well
- for user, passwd in bots:
- self.checker.addUser(user, passwd)
- self.checker.addUser("change", "changepw")
-
- # TODO: hang up on old bots
- self.bots = bots
+ self.loadConfig_Slaves(bots)
# self.debugPassword
if debugPassword:
@@ -728,15 +767,7 @@
self.manhole = manhole
manhole.setServiceParent(self)
- # self.sources: shut down any that were removed, start any that were
- # added
- old = self.sources
- new = sources
- [self.change_svc.removeSource(source)
- for source in old if source not in new]
- [self.change_svc.addSource(source)
- for source in new if source not in old]
- self.sources = sources
+ self.loadConfig_Sources(sources)
# add/remove self.botmaster.builders to match builders. The
# botmaster will handle startup/shutdown issues.
@@ -761,6 +792,35 @@
log.msg("configuration updated")
+ def loadConfig_Slaves(self, bots):
+ # set up the Checker with the names and passwords of all valid bots
+ self.checker.users = {} # violates abstraction, oh well
+ for user, passwd in bots:
+ self.checker.addUser(user, passwd)
+ self.checker.addUser("change", "changepw")
+
+ # identify new/old bots
+ old = self.bots; oldnames = [name for name,pw in old]
+ new = bots; newnames = [name for name,pw in new]
+ # removeSlave will hang up on the old bot
+ [self.botmaster.removeSlave(name)
+ for name in oldnames if name not in newnames]
+ [self.botmaster.addSlave(name)
+ for name in newnames if name not in oldnames]
+
+ # all done
+ self.bots = bots
+
+ def loadConfig_Sources(self, sources):
+ # shut down any that were removed, start any that were added
+ old = self.sources
+ new = sources
+ [self.change_svc.removeSource(source)
+ for source in old if source not in new]
+ [self.change_svc.addSource(source)
+ for source in new if source not in old]
+ self.sources = sources
+
def loadConfig_Builders(self, newBuilders):
old = self.botmaster.getBuildernames()
newNames = []
From warner at users.sourceforge.net Fri Dec 3 22:54:52 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Fri, 03 Dec 2004 22:54:52 +0000
Subject: [Buildbot-commits] buildbot ChangeLog,1.319,1.320
Message-ID:
Update of /cvsroot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv20004
Modified Files:
ChangeLog
Log Message:
Make commands (and builds) interruptible. Improve lost-slave behavior.
Merging in several days of changes from local Arch branch, see ChangeLog for
details about individual files.
Index: ChangeLog
===================================================================
RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v
retrieving revision 1.319
retrieving revision 1.320
diff -u -d -r1.319 -r1.320
--- ChangeLog 28 Nov 2004 01:53:52 -0000 1.319
+++ ChangeLog 3 Dec 2004 22:54:50 -0000 1.320
@@ -1,3 +1,140 @@
+2004-12-03 Brian Warner
+
+ * buildbot/master.py: clean up slave-handling code, to handle
+ slave-disconnect and multiple-connect better
+ (BotPerspective): make these long-lasting, exactly one per bot
+ listed in the config file.
+ (BotPerspective.attached): if a slave connects while an existing
+ one appears to still be connected, disconnect the old one first.
+ (BotPerspective.disconnect): new method to forcibly disconnect a
+ buildslave. Use some hacks to empty the transmit buffer quickly to
+ avoid the long (20-min?) TCP timeout that could occur if the old
+ slave has dropped off the net.
+ (BotMaster): Keep persistent BotPerspectives in .slaves, let them
+ own their own SlaveStatus objects. Remove .attached/.detached, add
+ .addSlave/.removeSlave, treat slaves like Builders (config file
+ parsing sends deltas to the BotMaster). Inform the slave
+ instances, i.e. the BotPerspective, about addBuilder and
+ removeBuilder.
+ (BotMaster.getPerspective): turns into a single dict lookup
+ (Dispatcher.requestAvatar): allow .attached to return a Deferred,
+ which gives BotPerspective.attached a chance to disconnect the old
+ slave first.
+ (BuildMaster.loadConfig): add code (disabled) to validate that all
+ builders use known slaves (listed in c['bots']). The check won't
+ work with tuple-specified builders, which are deprecated but not
+ yet invalid, so the check is disabled for now.
+ (BuildMaster.loadConfig_Slaves): move slave-config into a separate
+ routine, do the add/changed/removed dance with them like we do
+ with builders.
+ (BuildMaster.loadConfig_Sources): move source-config into a
+ separate routine too
+
+ * buildbot/status/builder.py (Status.getSlave): get the
+ SlaveStatus object from the BotPerspective, not the BotMaster.
+
+ * buildbot/test/test_run.py: bunch of new tests for losing the
+ buildslave at various points in the build, handling a slave that
+ connects multiple times, and making sure we can interrupt a
+ running build
+
+ * buildbot/slave/bot.py (BuildSlave): make it possible to use
+ something other than 'Bot' for the Bot object, to make certain
+ test cases easier to write.
+ (BuildSlave.waitUntilDisconnected): utility method for testing
+
+2004-11-30 Brian Warner
+
+ * buildbot/test/test_run.py (RunMixin): refactor, remove debug msg
+
+ * buildbot/interfaces.py (IBuilderControl.ping): add timeout=
+ argument, return a Deferred that always fires with True or False.
+ I don't use an errback to indicate 'ping failed' so that callers
+ are free to ignore the deferred without causing spurious errors in
+ the logs.
+ * buildbot/process/builder.py (BuilderControl.ping): implement it
+
+ * buildbot/test/test_run.py (Status.testDisappear): test ping
+ (Status.disappearSlave): fix it
+
+2004-11-30 Brian Warner
+
+ * buildbot/interfaces.py (IBuildControl): add .stopBuild
+ (IBuilderControl): add .getBuild(num), only works for the current
+ build, of course, although it might be interesting to offer
+ something for builds in the .waiting or .interlocked state.
+
+ * buildbot/process/base.py (Build): have .stopBuild just do the
+ interrupt, then let the build die by itself.
+ (BuildControl): add .stopBuild, and add a point-event named
+ 'interrupt' just after the build so status viewers can tell that
+ someone killed it.
+ (BuilderControl): add .getBuild
+
+ * buildbot/process/step.py (Dummy): use haltOnFailure so it really
+ stops when you kill it, good for testing
+ (ShellCommand.interrupt): add a logfile named 'interrupt' which
+ contains the 'reason' text.
+
+ * buildbot/status/html.py: Add Stop Build button, if the build can
+ still be stopped. Send a Redirect (to the top page) one second
+ later, hopefully long enough for the interrupt to have an effect.
+ Move make_row() up to top-level to share it between Stop Build and
+ Force Build.
+
+ * buildbot/slave/commands.py: only kill the child process once
+
+ * buildbot/test/test_run.py: add testInterrupt
+
+2004-11-29 Brian Warner
+
+ * buildbot/process/base.py: Refactor command interruption. The
+ Build is now responsible for noticing that the slave has gone
+ away: Build.lostRemote() interrupts the current step and makes
+ sure that no further ones will be started.
+
+ * buildbot/process/builder.py: When the initial remote_startBuild
+ message fails, log it: this usually indicates that the slave has
+ gone away, but we don't really start paying attention until they
+ fail to respond to the first step's command.
+
+ * buildbot/process/step.py (RemoteCommand): Does *not* watch for
+ slave disconnect. Now sports a new interrupt() method. Error
+ handling was simplified a lot by chaining deferreds, so
+ remoteFailed/remoteComplete were merged into a single
+ remoteComplete method (which can now get a Failure object).
+ Likewise failed/finished were merged into just _finished.
+ (BuildStep): Add interrupt(why) method, and if why is a
+ ConnectionLost Failure then the step is failed with some useful
+ error text.
+
+ * buildbot/slave/bot.py: stop the current command when the remote
+ Step reference is lost, and when the slave is shut down.
+ (Bot): make it a MultiService, so it can have children. Use
+ stopService to tell when the slave is shutting down.
+ (SlaveBuilder): make it a Service, and a child of the Bot. Add
+ remote_interruptCommand (which asks the current SlaveCommand to
+ stop but allows it to keep emitting status messages), and
+ stopCommand (which tells it to shut up and die).
+
+ * buildbot/slave/commands.py: make commands interruptible
+ (ShellCommand.kill): factor out os.kill logic
+ (Command): factor out setup()
+ (Command.sendStatus): don't send status if .running is false, this
+ happens when the command has been halted.
+ (Command.interrupt): new method, used to tell the command to die
+ (SlaveShellCommand): implement .interrupt
+ (DummyCommand): implement .interrupt
+ (SourceBase, etc): factor out setup(), don't continue substeps if
+ .interrupted is set
+
+ * buildbot/status/builder.py: fix all waitUntilFinished() methods
+ so they can be called after finishing
+
+ * buildbot/test/test_run.py: new tests for disconnect behavior,
+ refactor slave-shutdown routines, add different kinds of
+ slave-shutdown
+
2004-11-27 Brian Warner
* buildbot/status/words.py (IrcStatusBot.convertTime): utility
From warner at users.sourceforge.net Sat Dec 4 21:02:06 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 21:02:06 +0000
Subject: [Buildbot-commits] buildbot/buildbot/slave commands.py,1.17,1.18 bot.py,1.4,1.5
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot/slave
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14277/buildbot/slave
Modified Files:
commands.py bot.py
Log Message:
* buildbot/slave/bot.py: clean up shutdown/lose-master code
(SlaveBuilder): make some attributes class-level, remove the old
"update queue" which existed to support resuming a build after the
master connection was lost. Try to reimplement that feature later.
(SlaveBuilder.stopCommand): clear self.command when the
SlaveCommand finishes, so that we don't try to kill a leftover one
at shutdown time.
(SlaveBuilder.commandComplete): same, merge with commandFailed and
.finishCommand
* buildbot/slave/commands.py (SourceBase): set self.command for
all VC commands, so they can be interrupted.
Index: bot.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/bot.py,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- bot.py 3 Dec 2004 22:54:52 -0000 1.4
+++ bot.py 4 Dec 2004 21:02:03 -0000 1.5
@@ -44,32 +44,27 @@
stopCommandOnShutdown = True
+ # remote is a ref to the Builder object on the master side, and is set
+ # when they attach. We use it to detect when the connection to the master
+ # is severed.
+ remote = None
+
+ # .build points to a SlaveBuild object, a new one for each build
+ build = None
+
+ # .command points to a SlaveCommand instance, and is set while the step
+ # is running. We use it to implement the stopBuild method.
+ command = None
+
+ # .remoteStep is a ref to the master-side BuildStep object, and is set
+ # when the step is started
+ remoteStep = None
+
def __init__(self, parent, name, builddir, not_really):
#service.Service.__init__(self)
self.name = name
self.builddir = builddir
self.not_really = not_really
- self.remote = None
- # remote is a ref to the Builder object on the master side, and is
- # set when they attach. It really isn't used very much
- self.build = None
- # .build points to a SlaveBuild object, a new one for each build
- self.command = None
- # .command points to a SlaveCommand instance, and is set when the
- # step is started. It remains until the step is completed and the
- # completion is acknowledged (via .ackComplete)
- self.remoteStep = None
- # .remoteStep is a ref to the master-side BuildStep object, and is
- # set when the step is started
- self.updateNum = None
- # .updateNum counts status updates. It is reset to zero at the
- # beginning of each step. Numbering the updates makes it possible to
- # reattach to a master that has been restarted
- self.updateQueue = []
- # unacknowledged status updates are kept in .updateQueue, and are
- # removed by .ackUpdate
- self.complete = None
- # an unacknowledged completion message lives in .complete
def __repr__(self):
return "<SlaveBuilder '%s'>" % self.name
@@ -83,9 +78,8 @@
def stopService(self):
service.Service.stopService(self)
- if self.command and self.stopCommandOnShutdown:
+ if self.stopCommandOnShutdown:
self.stopCommand()
- self.command = None
def remote_setMaster(self, remote):
self.remote = remote
@@ -100,7 +94,7 @@
def lostRemoteStep(self, remotestep):
log.msg("lost remote step")
self.remoteStep = None
- if self.command and self.stopCommandOnShutdown:
+ if self.stopCommandOnShutdown:
self.stopCommand()
# the following are Commands that can be invoked by the master-side
@@ -111,6 +105,7 @@
one step to the next."""
self.build = SlaveBuild(self)
log.msg("startBuild")
+
def remote_startCommand(self, stepref, stepId, command, args):
"""This is called multiple times by various master-side BuildSteps,
to start various commands that actually do the build."""
@@ -118,7 +113,6 @@
if self.command:
log.msg("leftover command, dropping it")
self.stopCommand()
- self.command = None
try:
factory, version = registry.commandRegistry[command]
@@ -127,14 +121,12 @@
self.command = factory(self, stepId, args)
log.msg(" startCommand:%s [id %s]" % (command,stepId))
- self.updateQueue = []
self.remoteStep = stepref
self.remoteStep.notifyOnDisconnect(self.lostRemoteStep)
- self.updateNum = 0
- self.complete = None
self.command.running = True
d = defer.maybeDeferred(self.command.start)
- d.addCallbacks(self.commandComplete, self.commandFailed)
+ d.addCallback(lambda res: None)
+ d.addBoth(self.commandComplete)
return None
def remote_interruptCommand(self, stepId, why):
@@ -147,120 +139,76 @@
return
self.command.interrupt()
+
def stopCommand(self):
+ """Make any currently-running command die, with no further status
+ output. This is used when the buildslave is shutting down or the
+ connection to the master has been lost. Interrupt the command,
+ silence it, and then forget about it."""
if not self.command:
return
- self.command.running = False
- if not self.command.interrupted:
- self.command.interrupt()
-
+ log.msg("stopCommand: halting current command %s" % self.command)
+ self.command.running = False # shut up!
+ self.command.interrupt() # die!
+ self.command = None # forget you!
- # these two are fired by the Deferred attached to each Command
- def commandComplete(self, dummy):
- if not self.running:
- return
- self.sendComplete()
- def commandFailed(self, why):
- if not self.running:
- return
- log.msg("commandFailed")
- log.err(why)
- self.sendComplete(why)
# sendUpdate is invoked by the Commands we spawn
- def sendUpdate(self, data=None):
+ def sendUpdate(self, data):
"""This sends the status update to the master-side BuildStep object,
giving it a sequence number in the process. It adds the update to
a queue, and asks the master to acknowledge the update so it can be
removed from that queue."""
if not self.running:
+ # .running comes from service.Service, and says whether the
+ # service is running or not. If we aren't running, don't send any
+ # status messages.
return
- self.updateNum += 1
- update = [data, self.updateNum]
- #log.msg("sendUpdate", update)
- self.updateQueue.append(update)
- if self.remoteStep: # ?? send to Builder or BuildStep?
+ # the update[1]=0 comes from the leftover 'updateNum', which the
+ # master still expects to receive. Provide it to avoid significant
+ # interoperability issues between new slaves and old masters.
+ if self.remoteStep:
+ update = [data, 0]
updates = [update]
d = self.remoteStep.callRemote("update", updates)
d.addCallback(self.ackUpdate)
d.addErrback(self._ackFailed, "SlaveBuilder.sendUpdate")
- # these are utility routines used by sendStatus and commandComplete
-
- def dummy(self, value):
+ def ackUpdate(self, acknum):
+ # TODO: update the "last activity" timer
pass
- def ackUpdate(self, acknum):
- """Normally, the master responds to remote_update by returning the
- update number of the highest contiguous update received. That number
- comes back to this routine, which removes the acknowledged updates
- from the queue."""
- # XXX: revamp this, I think it needs a retransmission timeout to
- # deal with sendAllUpdates that don't all get acknowledged. Might be
- # ok, though.
- unacked = []
- for update in self.updateQueue:
- (data, updatenum) = update
- if updatenum > acknum:
- unacked.append(update)
- self.updateQueue = unacked
- # also, if the terminal status message (resulting from
- # commandComplete or commandFailed) is acked, we can finally get rid
- # of the command by clearing .stepRef and .command. We have to do
- # this, otherwise we'll think we're still running the command and
- # won't be able to answer remote_reattach correctly. XXX: is that
- # true?
+ def ackComplete(self, dummy):
+ # TODO: update the "last activity" timer
+ pass
def _ackFailed(self, why, where):
log.msg("SlaveBuilder._ackFailed:", where)
#log.err(why) # we don't really care
- def sendAllUpdates(self):
- """This is called after reattachment to send all queued updates."""
- if self.updateQueue and self.remoteStep:
- d = self.remoteStep.callRemote("update", self.updateQueue)
- d.addCallback(self.ackUpdate)
- d.addErrback(self._ackFailed, "SlaveBuilder.sendAllUpdates")
- def sendComplete(self, failure=None):
- # failure, if present, is a failure.Failure. To send it across the
- # wire, we must turn it into a pb.CopyableFailure.
+ # this is fired by the Deferred attached to each Command
+ def commandComplete(self, failure):
if failure:
+ log.msg("SlaveBuilder.commandFailed", self.command)
+ log.err(why)
+ # failure, if present, is a failure.Failure. To send it across
+ # the wire, we must turn it into a pb.CopyableFailure.
failure = pb.CopyableFailure(failure)
failure.unsafeTracebacks = True
- self.complete = [failure]
+ else:
+ # failure is None
+ log.msg("SlaveBuilder.commandComplete", self.command)
+ self.command = None
+ if not self.running:
+ return
if self.remoteStep:
+ self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep)
d = self.remoteStep.callRemote("complete", failure)
d.addCallback(self.ackComplete)
d.addErrback(self._ackFailed, "sendComplete")
+ self.remoteStep = None
- def ackComplete(self, dummy):
- # this is the call that finally finishes the step
- self.finishCommand()
-
- def sendAllCompletes(self):
- if self.complete and self.remoteStep:
- d = self.remoteStep.callRemote("complete", self.complete[0])
- d.addCallback(self.ackComplete)
- d.addErrback(self._ackFailed, "sendAllCompletes")
-
- def remote_reattach(self, stepref, stepId):
- # were we executing something?
- if not self.command:
- raise NoCommandRunning
- # were we executing the same thing that they think we were?
- if self.command.stepId != stepId:
- raise WrongCommandRunning
- # send them our unacked status
- self.remoteStep = stepref
- self.sendAllUpdates()
- self.sendAllCompletes()
-
- def finishCommand(self):
- log.msg("SlaveBuilder.finishCommand", self.command)
- self.remoteStep.dontNotifyOnDisconnect(self.lostRemoteStep)
- self.remoteStep = None
- self.command = None
def remote_shutdown(self):
print "slave shutting down on command from master"
Index: commands.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/slave/commands.py,v
retrieving revision 1.17
retrieving revision 1.18
diff -u -d -r1.17 -r1.18
--- commands.py 3 Dec 2004 22:54:52 -0000 1.17
+++ commands.py 4 Dec 2004 21:02:03 -0000 1.18
@@ -305,7 +305,7 @@
If .running is False, the bot is shutting down (or has otherwise lost the
connection to the master), and should not send any status messages. This
- is taken care of in Command.sendStatus .
+ is checked in Command.sendStatus .
"""
@@ -329,7 +329,11 @@
pass
def start(self):
- # should return a Deferred
+ """Start the command. self.running will be set just before this is
+ called. This method should return a Deferred that will fire when the
+ command has completed.
+
+ This method should be overridden by subclasses."""
raise NotImplementedError, "You must implement this in a subclass"
def sendStatus(self, status):
@@ -343,10 +347,12 @@
def interrupt(self):
"""Override this in a subclass to allow commands to be interrupted.
- May be called multiple times, use self.interrupted=True if this
- matters."""
+ May be called multiple times, test and set self.interrupted=True if
+ this matters."""
pass
+ # utility methods, mostly used by SlaveShellCommand and the like
+
def _abandonOnFailure(self, rc):
if type(rc) is not int:
log.msg("weird, _abandonOnFailure was given rc=%s (%s)" % \
@@ -423,6 +429,8 @@
return self.d
def interrupt(self):
+ if self.interrupted:
+ return
self.timer.cancel()
self.timer = None
self.interrupted = True
@@ -578,6 +586,7 @@
command = ["rm", "-rf", d]
c = ShellCommand(self.builder, command, self.builder.basedir,
sendRC=0, timeout=self.timeout)
+ self.command = c
# sendRC=0 means the rm command will send stdout/stderr to the
# master, but not the rc=0 when it finishes. That job is left to
# _sendRC
@@ -595,6 +604,7 @@
command = ['cp', '-r', fromdir, todir]
c = ShellCommand(self.builder, command, self.builder.basedir,
sendRC=False, timeout=self.timeout)
+ self.command = c
d = c.start()
d.addCallback(self._abandonOnFailure)
return d
@@ -609,6 +619,7 @@
c = ShellCommand(self.builder, command, dir,
sendRC=False, timeout=self.timeout,
stdin=diff)
+ self.command = c
d = c.start()
d.addCallback(self._abandonOnFailure)
return d
@@ -651,6 +662,7 @@
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout,
stdin=self.login+"\n")
+ self.command = c
d = c.start()
d.addCallback(self._abandonOnFailure)
d.addCallback(self._didLogin)
@@ -671,6 +683,7 @@
command += ['-D', self.revision]
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout)
+ self.command = c
return c.start()
def doVCFull(self):
@@ -689,6 +702,7 @@
command += [self.cvsmodule]
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout)
+ self.command = c
return c.start()
registerSlaveCommand("cvs", CVS, cvs_ver)
@@ -720,6 +734,7 @@
command = ['svn', 'update', '--revision', str(revision)]
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout)
+ self.command = c
return c.start()
def doVCFull(self):
@@ -734,6 +749,7 @@
self.svnurl, self.srcdir]
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout)
+ self.command = c
return c.start()
registerSlaveCommand("svn", SVN, cvs_ver)
@@ -765,6 +781,7 @@
command = ['darcs', 'pull', '--all', '--verbose']
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout)
+ self.command = c
return c.start()
def doVCFull(self):
@@ -776,6 +793,7 @@
self.repourl]
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout)
+ self.command = c
return c.start()
registerSlaveCommand("darcs", Darcs, cvs_ver)
@@ -812,6 +830,7 @@
command = ['tla', 'update']
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout)
+ self.command = c
return c.start()
def doVCFull(self):
@@ -830,6 +849,7 @@
c = ShellCommand(self.builder, command, self.builder.basedir,
sendRC=False, keepStdout=True,
timeout=self.timeout)
+ self.command = c
d = c.start()
d.addCallback(self._abandonOnFailure)
d.addCallback(self._didRegister, c)
@@ -851,6 +871,7 @@
self.version, self.srcdir]
c = ShellCommand(self.builder, command, self.builder.basedir,
sendRC=False, timeout=self.timeout)
+ self.command = c
d = c.start()
d.addCallback(self._abandonOnFailure)
if self.buildconfig:
@@ -862,6 +883,7 @@
command = ['tla', 'build-config', self.buildconfig]
c = ShellCommand(self.builder, command, d,
sendRC=False, timeout=self.timeout)
+ self.command = c
d = c.start()
d.addCallback(self._abandonOnFailure)
return d
@@ -893,6 +915,7 @@
env = {'P4PORT': self.p4port}
c = ShellCommand(self.builder, command, d, environ=env,
sendRC=False, timeout=self.timeout)
+ self.command = c
return c.start()
def doVCFull(self):
From warner at users.sourceforge.net Sat Dec 4 21:02:06 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 21:02:06 +0000
Subject: [Buildbot-commits] buildbot ChangeLog,1.320,1.321
Message-ID:
Update of /cvsroot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv14277
Modified Files:
ChangeLog
Log Message:
* buildbot/slave/bot.py: clean up shutdown/lose-master code
(SlaveBuilder): make some attributes class-level, remove the old
"update queue" which existed to support resuming a build after the
master connection was lost. Try to reimplement that feature later.
(SlaveBuilder.stopCommand): clear self.command when the
SlaveCommand finishes, so that we don't try to kill a leftover one
at shutdown time.
(SlaveBuilder.commandComplete): same, merge with commandFailed and
.finishCommand
* buildbot/slave/commands.py (SourceBase): set self.command for
all VC commands, so they can be interrupted.
Index: ChangeLog
===================================================================
RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v
retrieving revision 1.320
retrieving revision 1.321
diff -u -d -r1.320 -r1.321
--- ChangeLog 3 Dec 2004 22:54:50 -0000 1.320
+++ ChangeLog 4 Dec 2004 21:02:04 -0000 1.321
@@ -1,3 +1,18 @@
+2004-12-04 Brian Warner
+
+ * buildbot/slave/bot.py: clean up shutdown/lose-master code
+ (SlaveBuilder): make some attributes class-level, remove the old
+ "update queue" which existed to support resuming a build after the
+ master connection was lost. Try to reimplement that feature later.
+ (SlaveBuilder.stopCommand): clear self.command when the
+ SlaveCommand finishes, so that we don't try to kill a leftover one
+ at shutdown time.
+ (SlaveBuilder.commandComplete): same, merge with commandFailed and
+ .finishCommand
+
+ * buildbot/slave/commands.py (SourceBase): set self.command for
+ all VC commands, so they can be interrupted.
+
2004-12-03 Brian Warner
* buildbot/master.py: clean up slave-handling code, to handle
From warner at users.sourceforge.net Sat Dec 4 21:12:22 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 21:12:22 +0000
Subject: [Buildbot-commits] buildbot ChangeLog,1.321,1.322
Message-ID:
Update of /cvsroot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16232
Modified Files:
ChangeLog
Log Message:
(Dispatcher.requestAvatar): remove debug message that broke PBChangeSource
Index: ChangeLog
===================================================================
RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v
retrieving revision 1.321
retrieving revision 1.322
diff -u -d -r1.321 -r1.322
--- ChangeLog 4 Dec 2004 21:02:04 -0000 1.321
+++ ChangeLog 4 Dec 2004 21:12:20 -0000 1.322
@@ -1,5 +1,8 @@
2004-12-04 Brian Warner
+ * buildbot/master.py (Dispatcher.requestAvatar): remove debug
+ message that broke PBChangeSource
+
* buildbot/slave/bot.py: clean up shutdown/lose-master code
(SlaveBuilder): make some attributes class-level, remove the old
"update queue" which existed to support resuming a build after the
From warner at users.sourceforge.net Sat Dec 4 21:12:22 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 21:12:22 +0000
Subject: [Buildbot-commits] buildbot/buildbot master.py,1.55,1.56
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv16232/buildbot
Modified Files:
master.py
Log Message:
(Dispatcher.requestAvatar): remove debug message that broke PBChangeSource
Index: master.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/master.py,v
retrieving revision 1.55
retrieving revision 1.56
diff -u -d -r1.55 -r1.56
--- master.py 3 Dec 2004 22:54:50 -0000 1.55
+++ master.py 4 Dec 2004 21:12:19 -0000 1.56
@@ -495,8 +495,6 @@
def requestAvatar(self, avatarID, mind, interface):
assert interface == pb.IPerspective
- log.msg("requestAvatar(%s) from %s" % \
- (avatarID, mind.broker.transport.getPeer()))
afactory = self.names.get(avatarID)
if afactory:
p = afactory.getPerspective()
From warner at users.sourceforge.net Sat Dec 4 21:18:26 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 21:18:26 +0000
Subject: [Buildbot-commits] buildbot/buildbot/status words.py,1.29,1.30
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot/status
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv17516/buildbot/status
Modified Files:
words.py
Log Message:
(IrcStatusBot.command_STOP): add a 'stop build' command to the IRC bot
Index: words.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/status/words.py,v
retrieving revision 1.29
retrieving revision 1.30
diff -u -d -r1.29 -r1.30
--- words.py 28 Nov 2004 05:20:23 -0000 1.29
+++ words.py 4 Dec 2004 21:18:24 -0000 1.30
@@ -256,6 +256,35 @@
d.addCallback(self.buildFinished, reply)
command_FORCE.usage = "force build - Force a build"
+ def command_STOP(self, user, reply, args):
+ args = args.split(None, 2)
+ if len(args) < 3 or args[0] != 'build':
+ raise UsageError, "try 'stop build WHICH '"
+ which = args[1]
+ reason = args[2]
+
+ buildercontrol = self.getControl(which)
+
+ who = None
+ r = "stopped: by IRC user <%s>: %s" % (user, reason)
+
+ # find an in-progress build
+ builderstatus = self.getBuilder(which)
+ buildstatus = builderstatus.getCurrentBuild()
+ if not buildstatus:
+ self.reply(reply, "sorry, no build is currently running")
+ return
+ num = buildstatus.getNumber()
+
+ # obtain the BuildControl object
+ buildcontrol = buildercontrol.getBuild(num)
+
+ # make it stop
+ bc.stopBuild(r)
+
+ self.reply(reply, "build %d interrupted" % num)
+ command_STOP.usage = "stop build - Stop a running build"
+
def emit_status(self, reply, which):
b = self.getBuilder(which)
str = "%s: " % which
From warner at users.sourceforge.net Sat Dec 4 21:18:26 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 21:18:26 +0000
Subject: [Buildbot-commits] buildbot ChangeLog,1.322,1.323
Message-ID:
Update of /cvsroot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv17516
Modified Files:
ChangeLog
Log Message:
(IrcStatusBot.command_STOP): add a 'stop build' command to the IRC bot
Index: ChangeLog
===================================================================
RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v
retrieving revision 1.322
retrieving revision 1.323
diff -u -d -r1.322 -r1.323
--- ChangeLog 4 Dec 2004 21:12:20 -0000 1.322
+++ ChangeLog 4 Dec 2004 21:18:24 -0000 1.323
@@ -1,5 +1,8 @@
2004-12-04 Brian Warner
+ * buildbot/status/words.py (IrcStatusBot.command_STOP): add a
+ 'stop build' command to the IRC bot
+
* buildbot/master.py (Dispatcher.requestAvatar): remove debug
message that broke PBChangeSource
From warner at users.sourceforge.net Sat Dec 4 21:20:21 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 21:20:21 +0000
Subject: [Buildbot-commits] buildbot/buildbot/status words.py,1.30,1.31
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot/status
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv18016
Modified Files:
words.py
Log Message:
fix stupid typo
Index: words.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/status/words.py,v
retrieving revision 1.30
retrieving revision 1.31
diff -u -d -r1.30 -r1.31
--- words.py 4 Dec 2004 21:18:24 -0000 1.30
+++ words.py 4 Dec 2004 21:20:18 -0000 1.31
@@ -280,7 +280,7 @@
buildcontrol = buildercontrol.getBuild(num)
# make it stop
- bc.stopBuild(r)
+ buildcontrol.stopBuild(r)
self.reply(reply, "build %d interrupted" % num)
command_STOP.usage = "stop build - Stop a running build"
From warner at users.sourceforge.net Sat Dec 4 22:17:03 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 22:17:03 +0000
Subject: [Buildbot-commits] buildbot ChangeLog,1.323,1.324
Message-ID:
Update of /cvsroot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv29472
Modified Files:
ChangeLog
Log Message:
update to current usage
Index: ChangeLog
===================================================================
RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v
retrieving revision 1.323
retrieving revision 1.324
diff -u -d -r1.323 -r1.324
--- ChangeLog 4 Dec 2004 21:18:24 -0000 1.323
+++ ChangeLog 4 Dec 2004 22:17:00 -0000 1.324
@@ -1,5 +1,7 @@
2004-12-04 Brian Warner
+ * docs/examples/twisted_master.cfg: update to current usage
+
* buildbot/status/words.py (IrcStatusBot.command_STOP): add a
'stop build' command to the IRC bot
From warner at users.sourceforge.net Sat Dec 4 22:17:02 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 22:17:02 +0000
Subject: [Buildbot-commits] buildbot/docs/examples twisted_master.cfg,1.25,1.26
Message-ID:
Update of /cvsroot/buildbot/buildbot/docs/examples
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv29472/docs/examples
Modified Files:
twisted_master.cfg
Log Message:
update to current usage
Index: twisted_master.cfg
===================================================================
RCS file: /cvsroot/buildbot/buildbot/docs/examples/twisted_master.cfg,v
retrieving revision 1.25
retrieving revision 1.26
diff -u -d -r1.25 -r1.26
--- twisted_master.cfg 7 Nov 2004 20:17:34 -0000 1.25
+++ twisted_master.cfg 4 Dec 2004 22:16:59 -0000 1.26
@@ -76,7 +76,7 @@
'slavename': "bot1",
'builddir': "quick",
'factory': QuickTwistedBuildFactory(svnurl,
- python="python2.2"),
+ python=["python2.2", "python2.3"]),
}
builders.append(b1)
@@ -129,7 +129,7 @@
}
builders.append(b3)
-reactors = ['gtk2', 'gtk', 'poll']
+reactors = ['gtk2', 'gtk', 'qt', 'poll']
b4 = {'name': "reactors",
'slavename': "bot2",
'builddir': "reactors",
@@ -162,7 +162,7 @@
builders.append(b22w32)
b23bsd = {'name': "freebsd",
- 'slavename': "bot-dialtone",
+ 'slavename': "bot-suszko",
'builddir': "bsd-full2.2",
'factory': TwistedReactorsBuildFactory(svnurl,
python="python2.3",
@@ -204,7 +204,8 @@
c['manhole'] = master.Manhole(*private.manhole)
c['status'].append(client.PBListener(9936))
m = mail.MailNotifier(fromaddr="buildbot at twistedmatrix.com",
- builders=["quick", "full-2.2", "full-2.3", "full-2.4"],
+ #builders=["quick", "full-2.2", "full-2.3", "full-2.4"],
+ builders=["quick", "full-2.3"],
sendToInterestedUsers=True,
extraRecipients=["warner at lothar.com"],
mode="problem",
From warner at users.sourceforge.net Sat Dec 4 22:30:25 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 22:30:25 +0000
Subject: [Buildbot-commits] buildbot/buildbot/test test_slavecommand.py,1.6,1.7
Message-ID:
Update of /cvsroot/buildbot/buildbot/buildbot/test
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32089/buildbot/test
Modified Files:
test_slavecommand.py
Log Message:
use sys.executable instead of hard-coding 'python' for child commands, might
help portability
Index: test_slavecommand.py
===================================================================
RCS file: /cvsroot/buildbot/buildbot/buildbot/test/test_slavecommand.py,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -d -r1.6 -r1.7
--- test_slavecommand.py 14 Oct 2004 17:28:34 -0000 1.6
+++ test_slavecommand.py 4 Dec 2004 22:30:23 -0000 1.7
@@ -11,7 +11,7 @@
import sys
startLogging(sys.stdout)
-import re, time
+import re, time, sys
import signal
from buildbot.slave.commands import SlaveShellCommand
@@ -115,7 +115,7 @@
self.assertEquals(got, expected)
def testShell1(self):
- cmd = "python emit.py 0"
+ cmd = sys.executable + " emit.py 0"
args = {'command': cmd, 'workdir': '.', 'timeout': 5}
failed = self.doTest(SlaveShellCommand, args)
self.failIf(failed)
@@ -124,7 +124,7 @@
self.checkrc(0)
def testShell2(self):
- cmd = "python emit.py 1"
+ cmd = sys.executable + " emit.py 1"
args = {'command': cmd, 'workdir': '.', 'timeout': 5}
failed = self.doTest(SlaveShellCommand, args)
self.failIf(failed)
@@ -133,7 +133,7 @@
self.checkrc(1)
def testShell3(self):
- cmd = "python emit.py 0"
+ cmd = sys.executable + " emit.py 0"
args = {'command': cmd, 'workdir': '.',
'env': {'EMIT_TEST': "envtest"}, 'timeout': 5}
failed = self.doTest(SlaveShellCommand, args)
@@ -145,7 +145,7 @@
self.checkrc(0)
def testShell4(self):
- cmd = "python emit.py 0"
+ cmd = sys.executable + " emit.py 0"
args = {'command': cmd, 'workdir': "subdir", 'timeout': 5}
failed = self.doTest(SlaveShellCommand, args)
self.failIf(failed)
From warner at users.sourceforge.net Sat Dec 4 22:30:25 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Sat, 04 Dec 2004 22:30:25 +0000
Subject: [Buildbot-commits] buildbot ChangeLog,1.324,1.325
Message-ID:
Update of /cvsroot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32089
Modified Files:
ChangeLog
Log Message:
use sys.executable instead of hard-coding 'python' for child commands, might
help portability
Index: ChangeLog
===================================================================
RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v
retrieving revision 1.324
retrieving revision 1.325
diff -u -d -r1.324 -r1.325
--- ChangeLog 4 Dec 2004 22:17:00 -0000 1.324
+++ ChangeLog 4 Dec 2004 22:30:23 -0000 1.325
@@ -1,5 +1,8 @@
2004-12-04 Brian Warner
+ * buildbot/test/test_slavecommand.py: use sys.executable instead
+ of hard-coding 'python' for child commands, might help portability
+
* docs/examples/twisted_master.cfg: update to current usage
* buildbot/status/words.py (IrcStatusBot.command_STOP): add a
From warner at users.sourceforge.net Mon Dec 6 01:23:15 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Mon, 06 Dec 2004 01:23:15 +0000
Subject: [Buildbot-commits] buildbot/docs source.xhtml,1.3,1.4
Message-ID:
Update of /cvsroot/buildbot/buildbot/docs
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32477/docs
Modified Files:
source.xhtml
Log Message:
(Arch): correct terminology
Index: source.xhtml
===================================================================
RCS file: /cvsroot/buildbot/buildbot/docs/source.xhtml,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- source.xhtml 8 Sep 2004 19:36:17 -0000 1.3
+++ source.xhtml 6 Dec 2004 01:23:12 -0000 1.4
@@ -147,7 +147,7 @@
timestamp.
Arch specifies a repository by
-URL, as well as a revision which is kind of like a branch name. Arch
+URL, as well as a version which is kind of like a branch name. Arch
uses the word archive to represent the repository. Arch lets you push
changes from one archive to another, removing the strict centralization
required by CVS and SVN. It seems to retain the distinction between
From warner at users.sourceforge.net Mon Dec 6 01:23:15 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Mon, 06 Dec 2004 01:23:15 +0000
Subject: [Buildbot-commits] buildbot ChangeLog,1.325,1.326
Message-ID:
Update of /cvsroot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv32477
Modified Files:
ChangeLog
Log Message:
(Arch): correct terminology
Index: ChangeLog
===================================================================
RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v
retrieving revision 1.325
retrieving revision 1.326
diff -u -d -r1.325 -r1.326
--- ChangeLog 4 Dec 2004 22:30:23 -0000 1.325
+++ ChangeLog 6 Dec 2004 01:23:13 -0000 1.326
@@ -1,3 +1,7 @@
+2004-12-05 Brian Warner
+
+ * docs/source.xhtml (Arch): correct terminology
+
2004-12-04 Brian Warner
* buildbot/test/test_slavecommand.py: use sys.executable instead
From warner at users.sourceforge.net Mon Dec 6 03:09:27 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Mon, 06 Dec 2004 03:09:27 +0000
Subject: [Buildbot-commits] buildbot/docs slave.xhtml,NONE,1.1
Message-ID:
Update of /cvsroot/buildbot/buildbot/docs
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22115/docs
Added Files:
slave.xhtml
Log Message:
provide a buildslave setup checklist
--- NEW FILE: slave.xhtml ---
Buildslave Configuration
Buildslave Configuration
This document describes the basics of setting up a buildslave.
Typically, you will be adding a buildslave to an existing buildmaster, to
provide additional architecture coverage. The buildbot administrator will
give you several pieces of information necessary to connect to the
buildmaster. You should also be somewhat familiar with the project being
tested, so you can troubleshoot build problems locally.
The buildbot exists to make sure that the project's stated "how to build
it" process actually works. To this end, the buildslave should run in an
environment just like that of your regular developers. Typically the project
build process is documented somewhere (README, INSTALL, etc),
in a document that should mention all library dependencies and contain a
basic set of build instructions. This document will be useful as you
configure the host and account in which the buildslave runs.
Checklist
You will need the following pieces before starting your buildslave for the
first time:
Set up the account: It is recommended (although not mandatory)
to set up a separate user account for the buildslave. This account is
frequently named 'buildbot'. This serves to isolate your personal working
environment from that of the slave, and helps to minimize the security
threat posed by letting possibly-unknown contributors run arbitrary code on
your system. The account should have a minimum of fancy init scripts.
Set up the host: Make sure the host can actually reach the
buildmaster. Usually the buildmaster is running a status webserver on the
same machine, so simply point your web browser at it and see if you can get
there. Install whatever additional packages or libraries the project's
INSTALL document advises. (Or not: if your buildslave is supposed to make
sure that building without optional libraries still works, then don't
install those libraries.)
Test the build process: Follow the instructions in the INSTALL
document, in the buildbot account. Perform a full CVS (or whatever)
checkout, configure, make, run tests, etc. Confirm that the build works
without manual fussing. If it doesn't work when you do it by hand, it will
be unlikely to work when the buildbot attempts to do it in an automated
fashion.
Choose a base directory: This should be somewhere in the
buildbot account, typically named after the project which is being tested.
The buildslave will not touch any file outside of this directory.
Create the hostinfo files: When it first connects, the
buildslave will send a few files up to the buildmaster which describe the
host that it is running on. These files are presented on the web status
display so that developers have more information to reproduce any test
failures that are witnessed by the buildbot. Create a directory named
'info' (directly under the buildbot home directory would be a good place).
Inside it, make a file named 'admin', and put your name/email in it. This
is the buildslave admin address, and will be reachable from the
build status page (so you may wish to munge it a bit if address-harvesting
spambots are a concern). Create a second file named 'host' and fill it with
a brief description of the host: OS, version, memory size, CPU speed,
versions of relevant libraries installed, and finally the version of the
buildbot code which is running the buildslave. You will create a symlink to
this directory when you finally set up the buildslave.
Get the buildmaster host/port, botname, and password: The
buildmaster admin will give you a hostname:portno pair which specifies the
TCP port on which the buildmaster is expecting connections from the
buildslaves. They will also assign you a name and password which your bot
will use.
Install the buildbot code: Obtain the latest tarball from buildbot.sf.net, verify the signature,
unpack it, then do the usual python ./setup.py build; sudo python
./setup.py install dance. If you do not have root on this host, you
can install it into a different directory as long as you remember to add it
to the PYTHONPATH environment variable at the right time. python
./setup.py install --home=~ and then
PYTHONPATH=~/lib/python is a common technique.
Configuring the buildslave
With all that setup ready, you are ready to create the buildslave. There
is a tool provided in the buildbot package named (simply enough)
buildbot, which usually gets installed to
/usr/bin/buildbot. This tool provides a front-end to a Twisted
program named mktap. When you use buildbot to
create the buildslave, it will create the base directory for you (and
complain if it already exists). Take the BASEDIR you've picked, the HOST:PORT
buildmaster location, and the BOTNAME and PASSWORD you've been assigned, and
run buildbot as follows:
buildbot slave BASEDIR HOST:PORT BOTNAME PASSWORD
That will create and populate BASEDIR with some setup files. The
buildbot.tap file contains a freeze-dried buildslave
object, ready to be run by Twisted's daemon-launching utility
twistd (pronounced twist-dee). That's it. In the future,
buildbot slave will probably do more setup.
Now symlink your hostinfo directory into place:
cd BASEDIR
ln -s ~/hostinfo ./info
Your buildslave is now ready to run!
Starting the buildslave
To start the buildslave manually, just use the buildbot tool
again:
buildbot start BASEDIR
This will start any freeze-dried application found in the given
directory. (The same command is used to start a buildmaster instance). Note
that buildbot start is really just a front end for
twistd.
As soon as the buildslave starts, you should find two new files in its
base directory. The first is named twistd.pid, and simply
contains the process ID of the buildslave's twistd process. You
can use ps to find it in your process table. The second is
twistd.log, and is the buildslave's log file. Everything the
buildslave does is recorded in this file. It is the first place to look for
error messages or exception traces.
Once the buildslave connects to the buildmaster, new directories will
start appearing in the base directory. The buildmaster tells the slave to
create a directory for each builder which will be using that slave.
Within these directories, CVS checkouts, compiles, and tests are
performed.
Making sure the buildslave starts at each reboot
Before you are done, you need to make sure the buildslave will keep
running even if the host reboots. The easiest way I've found to do this is to
add a @reboot crontab entry. Most modern versions of cron
interpret a time specification of @reboot to indicate that the
given job should be run the first time cron is started after system boot.
Something like the following usually works:
@reboot buildbot start BASEDIR
It is important to remember that the environment provided to cron jobs can
be quite different than your normal runtime. There may be fewer environment
variables specified, and the PATH may be shorter than usual. It is a good
idea to test out this method of launching the buildslave by using a time in
the near future, with the same command, and then check twistd.log to make
sure the slave actually started correctly.
Shutting down the buildslave
To stop the buildslave manually, use the buildbot tool
again:
buildbot stop BASEDIR
This simply looks for the twistd.pid file and kills whatever
process is identified within.
At system shutdown, all processes are sent a SIGTERM. The buildslave will
respond to this by shutting down normally.
Troubleshooting
Cron jobs are typically run with a minimal shell (/bin/sh, not /bin/bash),
and tilde expansion is not always performed in such commands. You may want to
use explicit paths, because the PATH is usually quite short and doesn't
include anything set by your shell's startup scripts (.profile, .bashrc,
etc). If you've installed buildbot (or other python libraries) to an unusual
location, you may need to add a PYTHONPATH specification (note that python
will do tilde-expansion on PYTHONPATH elements by itself).
Take the time to get the @reboot job set up. Otherwise, things will work
fine for a while, but the first power outage or system reboot you have will
stop the buildslave with nothing but the cries of sorrowful developers to
remind you that it has gone away.
From the buildmaster's main status web page, you can force a build to be
run on your build slave. Figure out which column is for a builder that runs
on your slave, click on that builder's name, and the page that comes up will
have a Force Build button. Fill in the form, hit the button, and a
moment later you should see your slave's twistd.log filling with
commands being run. Using pstree or top should also
reveal the cvs/make/gcc/etc processes being run by the buildslave. Note that
the same web page should also show the admin and host
information files that you configured earlier.
From warner at users.sourceforge.net Mon Dec 6 03:09:27 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Mon, 06 Dec 2004 03:09:27 +0000
Subject: [Buildbot-commits] buildbot ChangeLog,1.326,1.327
Message-ID:
Update of /cvsroot/buildbot/buildbot
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv22115
Modified Files:
ChangeLog
Log Message:
provide a buildslave setup checklist
Index: ChangeLog
===================================================================
RCS file: /cvsroot/buildbot/buildbot/ChangeLog,v
retrieving revision 1.326
retrieving revision 1.327
diff -u -d -r1.326 -r1.327
--- ChangeLog 6 Dec 2004 01:23:13 -0000 1.326
+++ ChangeLog 6 Dec 2004 03:09:25 -0000 1.327
@@ -1,5 +1,7 @@
2004-12-05 Brian Warner
+ * docs/slave.xhtml: provide a buildslave setup checklist
+
* docs/source.xhtml (Arch): correct terminology
2004-12-04 Brian Warner
From warner at users.sourceforge.net Mon Dec 6 03:31:49 2004
From: warner at users.sourceforge.net (Brian Warner)
Date: Mon, 06 Dec 2004 03:31:49 +0000
Subject: [Buildbot-commits] buildbot/docs slave.xhtml,1.1,1.2
Message-ID:
Update of /cvsroot/buildbot/buildbot/docs
In directory sc8-pr-cvs1.sourceforge.net:/tmp/cvs-serv26287
Modified Files:
slave.xhtml
Log Message:
more notes
Index: slave.xhtml
===================================================================
RCS file: /cvsroot/buildbot/buildbot/docs/slave.xhtml,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -d -r1.1 -r1.2
--- slave.xhtml 6 Dec 2004 03:09:24 -0000 1.1
+++ slave.xhtml 6 Dec 2004 03:31:46 -0000 1.2
@@ -186,7 +186,21 @@
respond to this by shutting down normally.
-
Troubleshooting
+
Maintenance
+
+
It is a good idea to check the buildmaster's status page every once in a
+while, to see if your buildslave is still online. Eventually the buildbot
+will probably be enhanced to send you email (via the info/admin email
+address) when the slave has been offline for more than a few hours.
+
+
If you find you can no longer provide a buildslave to the project, please
+let the project admins know, so they can put out a call for a
+replacement.
+
+
+
Troubleshooting
+
+
Starting the buildslave
Cron jobs are typically run with a minimal shell (/bin/sh, not /bin/bash),
and tilde expansion is not always performed in such commands. You may want to
@@ -205,6 +219,26 @@
stop the buildslave with nothing but the cries of sorrowful developers to
remind you that it has gone away.
+
+
Connecting to the buildmaster
+
+
If the buildslave cannot connect to the buildmaster, the reason should be
+described in the twistd.log logfile. Some common problems are an
+incorrect master hostname or port number, or a mistyped bot name or password.
+If the buildslave loses the connection to the master, it is supposed to
+attempt to reconnect with an exponentially-increasing backoff. Each attempt
+(and the time of the next attempt) will be logged. If you get impatient, just
+manually stop and re-start the buildslave.
+
+
When the buildmaster is restarted, all slaves will be disconnected, and
+will attempt to reconnect as usual. The reconnect time will depend upon how
+long the buildmaster is offline (i.e. how far up the exponential backoff
+curve the slaves have travelled). Again, buildbot stop BASEDIR;
+buildbot start BASEDIR will speed up the process.
+
+
+
Running builds
+
From the buildmaster's main status web page, you can force a build to be
run on your build slave. Figure out which column is for a builder that runs
on your slave, click on that builder's name, and the page that comes up will
@@ -215,5 +249,4 @@
the same web page should also show the admin and host
information files that you configured earlier.