[Server-devel] [PATCH] Introducing ds-cleanup - free up space cleaning up old snapshots

martin.langhoff at gmail.com martin.langhoff at gmail.com
Tue Jul 22 02:36:14 EDT 2008


From: Martin Langhoff <martin at laptop.org>

This is a dangerous but important job. With this patch we introduce
the concept of a per-service quota. The ds-backup quota defaults to
70% of the disk on which /library/users is, but can be overridden /
controlled from /etc/xs-quotas.conf

If the (soft) quota is met, this script will first remove daily
snapshots that are older than its MAX_AGE_DAILY (default: 90 days)
keeping only one monthly snapshot (the earliest of each month).

If after the cleanup of the older dailies we are over quota, it will
remove old snapshots of users that are over the user quota (defined
as ds-backup quota / # of users) until things fit within the quota.

Warning: if buggy or misconfigured, this script has the ability to
delete the very user data we are intending to preserve.
---
 Makefile.build                    |    3 +
 ds-backup.spec.in                 |    3 +
 server/cron-ds-backup-server.conf |    8 ++
 server/ds-cleanup.py              |  188 +++++++++++++++++++++++++++++++++++++
 server/ds-cleanup.sh              |   50 ++++++++++
 5 files changed, 252 insertions(+), 0 deletions(-)
 create mode 100644 server/cron-ds-backup-server.conf
 create mode 100755 server/ds-cleanup.py
 create mode 100755 server/ds-cleanup.sh

diff --git a/Makefile.build b/Makefile.build
index 1a9af81..f497ebb 100644
--- a/Makefile.build
+++ b/Makefile.build
@@ -11,10 +11,13 @@ install-client:
 install-server:
 	install -D -d $(DESTDIR)/usr/bin
 	install -D server/ds-postprocess.py $(DESTDIR)/usr/bin
+	install -D server/ds-cleanup.sh $(DESTDIR)/usr/bin
+	install -D server/ds-cleanup.py $(DESTDIR)/usr/bin
 	install -D -d $(DESTDIR)/var/www/ds-backup
 	install -D server/backup-available.py $(DESTDIR)/var/www/ds-backup
 	install -D server/ds-restore.php      $(DESTDIR)/var/www/ds-backup
 	install -D -m 644 server/incron-ds-backup.conf  $(DESTDIR)/etc/incron.d/ds-backup.conf
+	install -D -m 644 server/cron-ds-backup-server.conf  $(DESTDIR)/etc/cron.d/ds-backup-server.conf
 	install -D -m 644 server/apache-ds-backup.conf  $(DESTDIR)/etc/httpd/conf.d/050-ds-backup.conf
 	install -D -d $(DESTDIR)/var/lib/ds-backup
 	# ownerships are set in the spec file - this execs as nonroot in rpmbuild
diff --git a/ds-backup.spec.in b/ds-backup.spec.in
index 43ac878..076b82e 100644
--- a/ds-backup.spec.in
+++ b/ds-backup.spec.in
@@ -82,10 +82,13 @@ service httpd restart
 %defattr(-,root,root,-)
 %doc README COPYING AUTHORS
 %config(noreplace) %{_sysconfdir}/httpd/conf.d/050-ds-backup.conf
+%config(noreplace) %{_sysconfdir}/cron.d/ds-backup-server.conf
 # incron will mess up with an rpmsave file
 # so replace inconditionally
 %config %{_sysconfdir}/incron.d/ds-backup.conf
 %{_bindir}/ds-postprocess.py
+%{_bindir}/ds-cleanup.sh
+%{_bindir}/ds-cleanup.py
 /var/www/ds-backup/backup-available.py
 /var/www/ds-backup/ds-restore.php
 %attr(700, apache, apache) %dir %{_localstatedir}/lib/ds-backup/recentclients
diff --git a/server/cron-ds-backup-server.conf b/server/cron-ds-backup-server.conf
new file mode 100644
index 0000000..7a5b9ab
--- /dev/null
+++ b/server/cron-ds-backup-server.conf
@@ -0,0 +1,8 @@
+# Blame: martin at laptop.org
+# 
+MAILTO=""
+PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin
+SHELL=/bin/bash
+*/30 * * * * root (/usr/bin/ds-cleanup.sh 2>&1 ) > /dev/null
+
+## To debug, replace > /dev/null above with >>/var/log/ds-cleanup.log
\ No newline at end of file
diff --git a/server/ds-cleanup.py b/server/ds-cleanup.py
new file mode 100755
index 0000000..bcf9285
--- /dev/null
+++ b/server/ds-cleanup.py
@@ -0,0 +1,188 @@
+#!/usr/bin/python
+#
+# This is the datastore backup cleanup.
+#
+# Invoke once an hour from cron.d, can be
+# invoked by the postprocess script if
+# we are tight on disk space on the partition.
+#
+# This script will run to completion only once a day
+# but the XS may not get power all day.
+#  
+
+# 3 months - 90 days max keep for daily snapshots
+MAX_AGE_DAILY=90
+# 70% default softquota
+DS_SOFT_QUOTA=70
+
+# Execution plan
+#
+#
+# - Keep one monthly snapshot per user for snapshots
+#   older than 3 months (MAX_AGE_DAILY).
+#   Laptops will attempt daily backups - so we keep all the
+#   dailies the first 90 days...
+#
+# - Service and user fairness
+#   Be default 70% the /library/users partition is for ds-backup.
+#   This can be overriden by an /etc/xs_quota.conf file 
+#
+#   Past the threshold, we will count the user accts with datastore dirs
+#   divide the avaialble blocks on the partition, du -sh on each user, and
+#   trim old monthly snapshots of the heaviest users until we are below
+#   the quota.
+#
+#   The fact that we are undertaking wide hardlinking means that deleting old
+#   snapshots might not free up as much as expected.
+#
+# - Hard-link across users? TODO :-)
+#
+#
+
+import syck
+import os
+import sys
+import subprocess
+from subprocess import PIPE
+import re
+import time
+
+basehomedir = '/library/users'
+
+# max 5% loadavg - run only on idle...
+if (os.getloadavg()[0] > 5):
+    sys.exit0()
+
+if os.path.exists('/etc/xs-quotas.conf'):
+    #qf = file('/etc/xs-quotas.conf', 'r')
+    quotaconf = syck.load(open('/etc/xs-quotas.conf', 'r').read());
+    if (quotaconf.has_key('ds-backup')):
+        quotaconf['ds-backup'] = int(quotaconf['ds-backup'])
+        if (quotaconf['ds-backup'] < 100):
+            DS_SOFT_QUOTA = quotaconf['ds-backup']
+        else:
+            sys.stderr.write('Odd quota')
+
+# take a measure of disk usage...
+# multiply the 'totals' by .95 to reflect that root
+# has 5% of the disk resources set aside
+libstat = os.statvfs(basehomedir)
+usedblockspc = 1 - float(libstat[4])/(libstat[2]*0.95)
+usedfnodespc = 1 - float(libstat[7])/(libstat[5]*0.95)
+
+# if below the quota, mark as done, do nothing
+if (usedblockspc < (DS_SOFT_QUOTA/100.0) and
+    usedfnodespc < (DS_SOFT_QUOTA/100.0)):
+    # mark as done for the day
+    os.system('touch /var/lib/ds-backup/ds-cleanup-done')
+    sys.exit(0)
+
+#
+# Remove dailies older than MAX_AGE_DAILY
+#
+# 
+# `find /library/users -maxdepth 2 -mindepth 2 -type d -name 'datastore-[0-9]*' | sort`
+# unfortunately, the piping below is memory-bound.
+#
+# TODO: `find|sort > tmpfile` and then read line by line from tmpfile
+# so only the sort stage is memory-bound, and we free up RAM for others.
+pfind  = subprocess.Popen(['find', basehomedir, '-maxdepth','2','-mindepth','2',
+               '-type', 'd', '-name', 'datastore-[0-9]*'], stdout=PIPE)
+psort = subprocess.Popen(['sort'],
+              stdin=pfind.stdout, stdout=PIPE)
+
+# Prepare for the job...
+# by making cutdate a string, the comparison later is logically simple
+# and fast.
+rex = re.compile(basehomedir+'/(.+?)/datastore-(\d\d\d\d)-(\d\d)-(\d\d)')
+cutdate = time.gmtime(time.time() - MAX_AGE_DAILY *60*60*24)
+cutdate =  ('%04d%02d%02d' % (cutdate[0], cutdate[1], cutdate[2]))
+
+# The directories will come in ASCENDING
+# order. So we will only keep the first of
+# each user/year/month 
+lastuserid = None
+lastyear   = None
+lastmonth  = None
+while 1:
+    ds_snapshot = psort.stdout.readline()
+    if not ds_snapshot:
+        break
+    ds_snapshot = ds_snapshot.rstrip()
+
+    m = rex.match(ds_snapshot)
+    if m:
+        (userid, year, month, day)= m.groups();
+        # same user,year,month
+        if (userid==lastuserid and year==lastyear and month==lastmonth
+            and cutdate > year+month+day):
+            # Call scary rm -fr -- using sudo to confine
+            # it to the approp
+            subprocess.call(['sudo', '-u', userid, 'rm', '-fr',
+                             '--one-file-system', '--', ds_snapshot])
+        # keep track of last-seen vars
+        lastuserid = userid
+        lastyear   = year
+        lastmonth  = month
+
+
+for retries in range(10):
+    # If we did this loop with a while 1 we could
+    # get into an infinite loop where we are over quota
+    # and no amount of pruning of ds snapshots
+    # can help. Some situations like a huge _current_ snapshot
+    # can DoS the backup service.
+
+    # take a measure of disk usage -- 
+    # multiply the 'totals' by .95 to reflect that root
+    # has 5% of the disk resources set aside
+    libstat = os.statvfs(basehomedir)
+    usedblockspc = 1 - float(libstat[4])/(libstat[2]*0.95)
+    usedfnodespc = 1 - float(libstat[7])/(libstat[5]*0.95)
+
+    # if below the quota, mark as done, do nothing
+    if (usedblockspc < (DS_SOFT_QUOTA/100.0) and
+        usedfnodespc < (DS_SOFT_QUOTA/100.0)):
+        # mark as done for the day
+        os.system('touch /var/lib/ds-backup/ds-cleanup-done')
+        sys.exit(0)
+
+    # sys.stderr.write('over the quota? '+str(usedblockspc) + '<' +str(DS_SOFT_QUOTA/100.0)+"\n")
+
+    ##
+    ## Remove old snapshots of users over the implied per-user quota
+    ## note that as user accounts are added we will dynamically shrink
+    ## the per-user quota.
+    ##
+    # surprise - we do want shell expansion here
+    # with subprocess '*' does not work :-/
+    userdirs = os.popen('du -s ' + basehomedir + '/*').readlines()
+
+    usercount = len(userdirs)
+    userquota = ((DS_SOFT_QUOTA/100.0) * libstat[2]) / usercount
+    userquota = int(userquota)
+
+    # Remove one old snapshot of every user over the threshold - the oldest one
+
+    while len(userdirs):
+        userdir    = userdirs.pop()
+        userblocks = int(userdir.split("\t")[0])
+        # du fakes the blocks to 1K while the
+        # quota blocks we have might have a different blocksize
+        userblocks = userblocks * (libstat[0] / 1024)
+        if userblocks > userquota:
+            userdir = userdir.split("\t")[1].rstrip()
+            pfind   = subprocess.Popen(['find', userdir, '-maxdepth','1','-mindepth','1',
+                                        '-type', 'd', '-name', 'datastore-[0-9]*'], stdout=PIPE)
+            psort = subprocess.Popen(['sort'],
+                                     stdin=pfind.stdout, stdout=PIPE)
+            ds_snapshot = psort.stdout.readline().rstrip()
+            m = rex.match(ds_snapshot)
+            if m:
+                (userid, year, month, day)= m.groups();
+                subprocess.call(['sudo', '-u', userid, 'rm', '-fr',
+                                 '--one-file-system', '--', ds_snapshot])
+
+
+# done
+os.system('touch /var/lib/ds-backup/ds-cleanup-done')
diff --git a/server/ds-cleanup.sh b/server/ds-cleanup.sh
new file mode 100755
index 0000000..d5d6526
--- /dev/null
+++ b/server/ds-cleanup.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+#
+# Wrapper around ds-cleanup -
+#
+# Author: Martin Langhoff <martin at laptop.org>
+#
+
+##
+## We use skip_ifrecent() to ensure a daily run
+## even in the face of unreliable power - so schedule
+## this on cron with a reasonable frequency rather
+## than once daily.
+##
+
+# If we have executed recently, leave it for later. Use
+# -mtime 0 for "today"
+# -mtime -1 for "since yesterday"
+# -mtime -10 for in the last 10 days
+#
+# Using -daystart means that the script is more eager to run
+# from early each day. Without -daystart, backups tend to happen
+# later and later every day, as they only start trying after 24hs...
+#
+# Another tack could be to try -mmin -1200 (20hs) -
+#
+function skip_ifrecent {
+    RECENT_CHECK='-daystart -mtime 0'
+    if [ `find /var/lib/ds-backup/ds-cleanup-done $RECENT_CHECK 2>/dev/null` ]
+    then
+	exit 0
+    fi
+}
+skip_ifrecent;
+
+# Execute ds-cleanup.py from the same
+# directory where we are. Use a flock
+# to prevent concurrent runs. If the
+# flock does not succeed immediately,
+# we quit.
+LOCKFILE=/var/lib/ds-backup/ds-cleanup.run
+
+# this script is IO heavy, and not
+# a priority, so we run it under ionice -c3 (idle class)
+
+flock -n $LOCKFILE ionice -c3 `dirname $0`/ds-cleanup.py
+EXITCODE=$?
+
+# Propagate the exit code of the flock/ds-backup invocation
+exit $EXITCODE
\ No newline at end of file
-- 
1.5.6.dirty



More information about the Server-devel mailing list