[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[no subject]
From: |
Ludovic Courtès |
Date: |
Thu, 5 Apr 2018 16:26:33 -0400 (EDT) |
branch: master
commit 074b9d02f1ca01007f39adbc019763027a51d9bd
Author: Ludovic Courtès <address@hidden>
Date: Thu Apr 5 22:17:45 2018 +0200
base: Let sqlite handle deduplication of the list of pending derivations.
Previously we would make a SQL query that would return many build jobs,
and then call 'delete-duplicates' on that. This was extremely wasteful
because the list of returned by the query was huge leading to a heap of
several tens of GiB on a big database, and 'delete-duplicates' would
lead to more GC and it would take ages.
Furthermore, since 'delete-duplicates' is written in C as of Guile
2.2.3, it is uninterruptible from Fiber's viewpoint. Consequently, the
kernel thread running the 'restart-builds' fiber would never schedule
other fibers, which could lead to deadlocks--e.g., since fibers are
scheduled on a circular shuffled list of kernel threads, once every N
times, a web server fiber would be sent to that kernel thread and not be
serviced.
* src/cuirass/base.scm (shuffle-jobs): Remove.
(shuffle-derivations): New procedure.
(spawn-builds): Take a list of derivations instead of a list of jobs.
(restart-builds): Remove 'builds' parameter. Remove 'delete-duplicates'
call. Remove done/remaining partitioning.
(build-packages): Adjust to pass 'spawn-builds' a list of derivations.
* bin/cuirass.in (main): Remove computation of PENDING. Remove second
parameter in call to 'restart-builds'.
---
bin/cuirass.in | 11 +++----
src/cuirass/base.scm | 82 ++++++++++++++++++----------------------------------
2 files changed, 32 insertions(+), 61 deletions(-)
diff --git a/bin/cuirass.in b/bin/cuirass.in
index fa0d6af..d27167c 100644
--- a/bin/cuirass.in
+++ b/bin/cuirass.in
@@ -128,12 +128,9 @@ exec ${GUILE:address@hidden@} --no-auto-compile -e main -s
"$0" "$@"
new-specs)))
(if one-shot?
(process-specs db (db-get-specifications db))
- (let ((exit-channel (make-channel))
- (pending
- (begin
- (clear-build-queue db)
- (log-message "retrieving list of pending
builds...")
- (db-get-builds db '((status pending))))))
+ (let ((exit-channel (make-channel)))
+
+ (clear-build-queue db)
;; First off, restart builds that had not completed or
;; were not even started on a previous run.
@@ -142,7 +139,7 @@ exec ${GUILE:address@hidden@} --no-auto-compile -e main -s
"$0" "$@"
'restart-builds exit-channel
(lambda ()
(with-database db
- (restart-builds db pending)))))
+ (restart-builds db)))))
(spawn-fiber
(essential-task
diff --git a/src/cuirass/base.scm b/src/cuirass/base.scm
index a96f640..c9c5ec1 100644
--- a/src/cuirass/base.scm
+++ b/src/cuirass/base.scm
@@ -362,15 +362,10 @@ Essentially this procedure inverts the
inversion-of-control that
;;; Building packages.
;;;
-(define (shuffle-jobs jobs)
- "Shuffle JOBS, a list of job alists."
+(define (shuffle-derivations drv)
+ "Shuffle DRV, a list of derivation file names."
;; Our shuffling algorithm is simple: we sort by .drv file name. :-)
- (define (job<? job1 job2)
- (let ((drv1 (assq-ref job1 #:derivation))
- (drv2 (assq-ref job2 #:derivation)))
- (string<? drv1 drv2)))
-
- (sort jobs job<?))
+ (sort drv string<?))
(define (update-build-statuses! store db lst)
"Update the build status of the derivations listed in LST, which have just
@@ -397,11 +392,10 @@ and returns the values RESULTS."
(print-exception (current-error-port) frame key args)
(apply values results)))))
-(define* (spawn-builds store db jobs
+(define* (spawn-builds store db drv
#:key (max-batch-size 200))
- "Build the derivations associated with JOBS, a list of job alists, updating
-DB as builds complete. Derivations are submitted in batches of at most
-MAX-BATCH-SIZE items."
+ "Build the derivations listed in DRV, updating DB as builds complete.
+Derivations are submitted in batches of at most MAX-BATCH-SIZE items."
;; XXX: We want to pass 'build-derivations' as many derivations at once so
;; we benefit from as much parallelism as possible (we must be using
;; #:keep-going? #t).
@@ -419,31 +413,27 @@ MAX-BATCH-SIZE items."
;; This code works around it by submitting derivations in batches of at most
;; MAX-BATCH-SIZE.
- (define total (length jobs))
+ (define total (length drv))
(log-message "building ~a derivations in batches of ~a"
total max-batch-size)
- ;; Shuffle jobs so that we don't build sequentially i686/x86_64/aarch64,
+ ;; Shuffle DRV so that we don't build sequentially i686/x86_64/aarch64,
;; master/core-updates, etc., which would be suboptimal.
- (let loop ((jobs (shuffle-jobs jobs))
+ (let loop ((drv (shuffle-derivations drv))
(count total))
(if (zero? count)
(log-message "done with ~a derivations" total)
(let*-values (((batch rest)
(if (> count max-batch-size)
- (split-at jobs max-batch-size)
- (values jobs '())))
- ((drv)
- (map (lambda (job)
- (assq-ref job #:derivation))
- batch)))
+ (split-at drv max-batch-size)
+ (values drv '()))))
(guard (c ((nix-protocol-error? c)
(log-message "batch of builds (partially) failed:\
~a (status: ~a)"
(nix-protocol-error-message c)
(nix-protocol-error-status c))))
- (log-message "building batch of ~a jobs (~a/~a)"
+ (log-message "building batch of ~a derivations (~a/~a)"
max-batch-size (- total count) total)
(let-values (((port finish)
(build-derivations& store drv)))
@@ -526,43 +516,26 @@ procedure is meant to be called at startup."
(- (time-second (current-time time-utc)) age)
";"))
-(define (restart-builds db builds)
+(define (restart-builds db)
"Restart builds whose status in DB is \"pending\" (scheduled or started)."
(with-store store
- (let*-values (((builds)
- (delete-duplicates builds build-derivation=?))
- ((valid stale)
- (partition (lambda (build)
- (let ((drv (assq-ref build #:derivation)))
- (valid-path? store drv)))
- builds)))
+ ;; Note: On a big database, 'db-get-pending-derivations' can take a couple
+ ;; of minutes, hence 'non-blocking'.
+ (log-message "retrieving list of pending builds...")
+ (let*-values (((valid stale)
+ (partition (cut valid-path? store <>)
+ (non-blocking (db-get-pending-derivations db)))))
;; We cannot restart builds listed in STALE, so mark them as canceled.
(log-message "canceling ~a stale builds" (length stale))
- (for-each (lambda (build)
- (db-update-build-status! db (assq-ref build #:derivation)
- (build-status canceled)))
+ (for-each (lambda (drv)
+ (db-update-build-status! db drv (build-status canceled)))
stale)
- ;; Those in VALID can be restarted, but some of them may actually be
- ;; done already--either because our database is outdated, or because it
- ;; was not built by Cuirass.
- (let-values (((done remaining)
- (partition (lambda (build)
- (match (assq-ref build #:outputs)
- (((name ((#:path . item))) _ ...)
- (valid-path? store item))
- (_ #f)))
- valid)))
- (log-message "~a of the pending builds had actually completed"
- (length done))
- (for-each (lambda (build)
- (db-update-build-status! db (assq-ref build #:derivation)
- (build-status succeeded)))
- done)
-
- (log-message "restarting ~a pending builds" (length remaining))
- (spawn-builds store db remaining)
- (log-message "done with restarted builds")))))
+ ;; Those in VALID can be restarted. If some of them were built in the
+ ;; meantime behind our back, that's fine: 'spawn-builds' will DTRT.
+ (log-message "restarting ~a pending builds" (length valid))
+ (spawn-builds store db valid)
+ (log-message "done with restarted builds"))))
(define (build-packages store db jobs)
"Build JOBS and return a list of Build results."
@@ -595,7 +568,8 @@ procedure is meant to be called at startup."
(define build-ids
(map register jobs))
- (spawn-builds store db jobs)
+ (spawn-builds store db
+ (map (cut assq-ref <> #:derivation) jobs))
(let* ((results (filter-map (cut db-get-build db <>) build-ids))
(status (map (cut assq-ref <> #:status) results))