[PATCH] Introduce greplease, and use it to read on-disk lease files
Martin Langhoff
martin at laptop.org
Thu Aug 20 12:33:46 EDT 2009
Lease files provided on disk can be huge, as we have seen in
large deployments (at their repair centers). Reading and parsing
those in-memory is extraordinarily expensive. Rough tests show
RAM use for a parsed leases file to be about 5x its size.
So for 'format 1' JSON lease files we use the greplease.grep()
function, which searches the leases file for the key->value pair
and returns just the value. That value is then handled by the
'singleton lease' codeflow in find_leases().
If the lease is a singleton lease, then we read it straight away.
The greplease code uses mmap when it is available. If we
get mmap in future initrds (~20KB), we can drop the read()-based version,
which is a tad convoluted.
---
This is a needed fix, but perhaps a controversial one. It tests well, but
it certainly needs review.
I have uploaded some sample large-ish sigfiles (padded with various things)
on http://dev.laptop.org/~martin/ - use 'cat-leases' to add your own leases
to it for testing.
According to Uruguay, their lease.sig was >100MB and XOs would die trying to
allocate ~500 MB for the parsed data structure.
---
src/activate.py | 40 +++++++++++----
src/greplease.py | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 178 insertions(+), 11 deletions(-)
create mode 100755 src/greplease.py
diff --git a/src/activate.py b/src/activate.py
index bac8769..fa0447f 100644
--- a/src/activate.py
+++ b/src/activate.py
@@ -11,13 +11,31 @@ from subprocess import check_call, call
from binascii import hexlify
sys.path += [ '/act-gui' ] # gui_client is in a subdir
from gui_client import send
+import greplease
-def try_blk(device, mnt, fstype='msdos'):
+def lease_from_file(fname, serial_num):
+ """Find the appropriate lease in a file that may be
+ a bare lease ("singleton") or a -- perhaps huge --
+ CJSON file.
+ """
+ fh = open(fname, 'r')
+ head = fh.read(5)
+ fh.close()
+ if head == '[1,{"':
+ # matches the start of a well-formed v1 leases file
+ # we use grep here to handle possibly huge lease files
+ # (in use at large deployments' repair centers)
+ return greplease.grep(fname, serial_num)
+ fh = open(fname, 'r')
+ fc = fh.read()
+ fh.close()
+ return fc
+
+def try_blk(device, mnt, serial_num, fstype='msdos'):
"""Try to mount a block device and read keylist from it."""
try:
with blk_mounted(device, mnt, fstype):
- with open(os.path.join(mnt,'lease.sig')) as f:
- return f.read()
+ return lease_from_file(os.path.join(mnt,'lease.sig'), serial_num)
except:
return None
@@ -328,9 +346,9 @@ def activate (serial_num, uuid):
# check SD card. #####################
send('SD start')
sd_init()
- keylist = try_blk('/dev/mmcblk0p1', SD_MNT)
+ keylist = try_blk('/dev/mmcblk0p1', SD_MNT, serial_num)
if not keylist:
- keylist = try_blk('/dev/mmcblk0', SD_MNT) # unpartitioned SD card
+ keylist = try_blk('/dev/mmcblk0', SD_MNT, serial_num) # unpartitioned SD card
if keylist:
send('SD success')
try:
@@ -344,12 +362,12 @@ def activate (serial_num, uuid):
# Check USB stick ####################
send('USB start')
usb_init()
- if not keylist:
- for suf in ['a1','a','b1','b','c1','c','b1','b','a1','a']:
- keylist = try_blk('/dev/sd'+suf, USB_MNT)
- if keylist: break
- # some USB keys take a while to come up
- time.sleep(1)
+ keylist = None
+ for suf in ['a1','a','b1','b','c1','c','b1','b','a1','a']:
+ keylist = try_blk('/dev/sd'+suf, USB_MNT, serial_num)
+ if keylist: break
+ # some USB keys take a while to come up
+ time.sleep(1)
if keylist:
send('USB success')
try:
diff --git a/src/greplease.py b/src/greplease.py
new file mode 100755
index 0000000..6a1ef95
--- /dev/null
+++ b/src/greplease.py
@@ -0,0 +1,149 @@
+#!/usr/bin/python
+
+import re
+
+def grep_for_lease_mmap(fpath, sn):
+ """Search a potentially larger-than-mem cjson file for
+ something that looks like a lease or a series of leases.
+
+ Uses mmap.
+
+ returns a string or False
+ """
+ import mmap
+ fh = open(fpath, 'r')
+ m = mmap.mmap(fh.fileno(), 0, mmap.MAP_SHARED, mmap.PROT_READ)
+
+ # find the start of it
+ rx = re.compile('"'+sn+'":"')
+ objkey = rx.search(m)
+
+ if objkey:
+ # find the tail - the first non-escaped
+ # doublequotes. This relies on sigs not
+ # having escape chars themselves.
+ # TODO: Negative look-behind assertion to handle
+ # escaped values.
+ rx = re.compile('"')
+ objend = rx.search(m, objkey.end())
+
+ if objkey and objend:
+ found = m[objkey.end():objend.start()]
+ else:
+ found = False
+
+ m.close()
+ fh.close()
+
+ return found
+
+def grep_for_lease_read(fpath, sn):
+ """Search a potentially larger-than-mem cjson file for
+ something that looks like a lease or a series of leases.
+
+ Uses old read()s
+
+ returns a string or False
+ """
+ # Use read()s, but keep stuff aligned to 4KB pages
+ # so we stand a chance to hit the fast paths.
+ page = 4096 #* 1024
+ step = 0
+ cursor = 0
+
+ needle = '"'+sn+'":"'
+ needlerx = re.compile(needle)
+ needlelength = len(needle)
+
+ fh = open(fpath, 'r')
+
+ buf = ''
+ buftail = ''
+
+ while True:
+
+ buf = fh.read(page)
+ if (buf == ''): # EOF
+ break
+
+ buf = buftail + buf
+
+ objkey = needlerx.search(buf)
+ if objkey:
+ # found the needle - issue a read
+ # from here and break
+ fh.seek( page * step + objkey.start() - len(buftail))
+ buf = fh.read(page)
+ # re-search for objkey - to get the offsets right
+ objkey = needlerx.search(buf)
+ break
+
+ # prep for next read - keep tail
+ # in case needle is on the boundary
+ buftail = buf[-needlelength:]
+ step = step+1
+ fh.seek( page * step )
+ #print " [ Seek to %s ]" % (page * step)
+
+ if objkey:
+ # find the tail - the first non-escaped
+ # doublequotes. This relies on sigs not
+ # having escape chars themselves.
+ # TODO: Negative look-behind assertion to handle
+ # escaped values.
+ rx = re.compile('"')
+ objend = rx.search(buf, objkey.end())
+
+ if objkey and objend:
+ found = buf[objkey.end():objend.start()]
+ else:
+ found = False
+
+ fh.close()
+
+ return found
+
+def grep(fpath, sn):
+
+ hasmmap = True
+ try:
+ import mmap
+ except:
+ hasmmap = False
+
+ if hasmmap:
+ return grep_for_lease_mmap(fpath, sn)
+ else:
+ return grep_for_lease_read(fpath, sn)
+
+
+## sample test - work through a cjson file
+## based on the 'words' dict file, in reverse.
+## each word is key and value, with the value
+## having its capitalisation reversed.
+# import sys
+# fh = open(sys.argv[1])
+# bigdata = {}
+# lines = fh.readlines()
+# lines.reverse()
+# for k in lines:
+# k = k.strip()
+# print "Looking for %s" % k
+# found = grep(sys.argv[2], k)
+# if found:
+# if found == k.swapcase():
+# print "... found good match"
+# else:
+# print "BAD MATCH %s" % found
+# else:
+# print "NO MATCH"
+
+## Another sample test - args: filename, SN
+#import sys
+#found = grep(sys.argv[1], sys.argv[2])
+#
+#if found:
+# print "Found:" + found
+#else:
+# print 'not found'
+
--
1.6.0.6
More information about the Devel
mailing list