[PATCH] Introduce greplease, and use it to read on-disk lease files

Thu Aug 20 12:33:46 EDT 2009

Lease files provided on disk can be huge, as we have seen in
large deployments (at their repair centers). Reading and parsing
those in-memory is extraordinarily expensive. Rough tests show
RAM use for a parsed leases file to be about 5x its size.

So for 'format 1' JSON lease files we use the greplease.grep()
function, which searches for the key->val combo in the leases file
and returns the value -- which is then handled by the 'singleton
lease' codeflow in find_leases().

If the lease is a singleton lease, then we read it straight away.

The greplease code has the option of using mmap if available. If we
get mmap in future initrds (~20KB) we can drop the read() based version
which is a tad convoluted.

---

This is a needed fix, but perhaps controversial. It tests well, but
sure needs review.

I have uploaded some sample large-ish sigfiles (padded with various things)
on http://dev.laptop.org/~martin/ - use 'cat-leases' to add your own leases
to it for testing.

According to Uruguay, their lease.sig was >100MB and XOs would die trying to
allocate ~500 MB for the parsed data structure.
 
---
 src/activate.py  |   40 +++++++++++----
 src/greplease.py |  149 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 178 insertions(+), 11 deletions(-)
 create mode 100755 src/greplease.py

diff --git a/src/activate.py b/src/activate.py
index bac8769..fa0447f 100644
--- a/src/activate.py
+++ b/src/activate.py
@@ -11,13 +11,31 @@ from subprocess import check_call, call
 from binascii import hexlify
 sys.path += [ '/act-gui' ] # gui_client is in a subdir
 from gui_client import send
+import greplease
 
-def try_blk(device, mnt, fstype='msdos'):
+def lease_from_file(fname, serial_num):
+    """Return the lease for serial_num stored in fname.
+
+       fname may hold a bare ("singleton") lease, which is returned
+       whole, or a -- perhaps huge -- v1 CJSON leases file, which is
+       searched with greplease.grep() rather than parsed in memory.
+       In the latter case the result is False if no lease matches.
+    """
+    fh = open(fname, 'r')
+    try:
+        head = fh.read(5)
+        if head != '[1,{"':
+            # not a v1 leases file: treat it as a singleton lease
+            return head + fh.read()
+    finally:
+        fh.close()
+    return greplease.grep(fname, serial_num)
+
+def try_blk(device, mnt, serial_num, fstype='msdos'):
     """Try to mount a block device and read keylist from it."""
     try:
         with blk_mounted(device, mnt, fstype):
-            with open(os.path.join(mnt,'lease.sig')) as f:
-                return f.read()
+            return lease_from_file(os.path.join(mnt,'lease.sig'), serial_num)
     except:
         return None
 
@@ -328,9 +346,9 @@ def activate (serial_num, uuid):
         # check SD card. #####################
         send('SD start')
         sd_init()
-        keylist = try_blk('/dev/mmcblk0p1', SD_MNT)
+        keylist = try_blk('/dev/mmcblk0p1', SD_MNT, serial_num)
         if not keylist:
-            keylist = try_blk('/dev/mmcblk0', SD_MNT) # unpartitioned SD card
+            keylist = try_blk('/dev/mmcblk0', SD_MNT, serial_num) # unpartitioned SD card
         if keylist:
             send('SD success')
             try:
@@ -344,12 +362,12 @@ def activate (serial_num, uuid):
         # Check USB stick ####################
         send('USB start')
         usb_init()
-        if not keylist:
-            for suf in ['a1','a','b1','b','c1','c','b1','b','a1','a']:
-                keylist = try_blk('/dev/sd'+suf, USB_MNT)
-                if keylist: break
-                # some USB keys take a while to come up
-                time.sleep(1)
+        keylist = None
+        for suf in ['a1','a','b1','b','c1','c','b1','b','a1','a']:
+            keylist = try_blk('/dev/sd'+suf, USB_MNT, serial_num)
+            if keylist: break
+            # some USB keys take a while to come up
+            time.sleep(1)
         if keylist:
             send('USB success')
             try:
diff --git a/src/greplease.py b/src/greplease.py
new file mode 100755
index 0000000..6a1ef95
--- /dev/null
+++ b/src/greplease.py
@@ -0,0 +1,149 @@
+#!/usr/bin/python
+
+import re
+
+def grep_for_lease_mmap(fpath, sn):
+    """Search a potentially larger-than-mem cjson file for
+       something that looks like a lease or a series of leases.
+
+       Uses mmap.
+
+       fpath -- path of the cjson leases file
+       sn    -- serial number, used as the object key
+
+       returns the string stored under sn, or False if the key
+       or the doublequotes closing its value cannot be found
+       """
+    import mmap
+    # sn is matched literally: used as a regex, any metacharacter
+    # in it would silently change what gets matched
+    needle = '"' + sn + '":"'
+    found = False
+    fh = open(fpath, 'r')
+    try:
+        m = mmap.mmap(fh.fileno(), 0, mmap.MAP_SHARED, mmap.PROT_READ)
+        try:
+            start = m.find(needle)
+            if start != -1:
+                # the value ends at the first doublequotes; this
+                # relies on sigs not having escape chars themselves
+                start = start + len(needle)
+                end = m.find('"', start)
+                if end != -1:
+                    found = m[start:end]
+        finally:
+            m.close()
+    finally:
+        fh.close()
+    return found
+
+def grep_for_lease_read(fpath, sn):
+    """Search a potentially larger-than-mem cjson file for
+       something that looks like a lease or a series of leases.
+
+       Uses old read()s, one page at a time: memory use is about
+       one page plus the size of the value found.
+
+       fpath -- path of the cjson leases file
+       sn    -- serial number, used as the object key
+
+       returns the string stored under sn, or False if the key
+       or the doublequotes closing its value cannot be found
+       (which covers an empty file too)
+       """
+    # Plain sequential read()s of one page each, starting at
+    # offset 0. On a regular file that keeps them on 4KB
+    # boundaries without any seek().
+    page = 4096
+
+    # sn is matched literally: used as a regex, any metacharacter
+    # in it would silently change what gets matched
+    needle = '"' + sn + '":"'
+    # a needle split across two reads leaves at most this many
+    # of its chars at the end of the first one
+    keep = len(needle) - 1
+
+    fh = open(fpath, 'r')
+    try:
+        # Step 1: find the needle
+        buftail = ''
+        while True:
+            buf = fh.read(page)
+            if buf == '': # EOF - the key is not in the file
+                return False
+
+            buf = buftail + buf
+            start = buf.find(needle)
+            if start != -1:
+                break
+
+            # prep for next read - keep tail
+            # in case needle is on the boundary
+            buftail = buf[-keep:]
+
+        # Step 2: collect the value, which ends at the first
+        # doublequotes. This relies on sigs not having escape
+        # chars themselves.
+        # TODO: handle escaped doublequotes inside values.
+        #
+        # The value can be longer than what is left of this
+        # page -- a series of leases may span many pages -- so
+        # keep reading until the closing doublequotes show up.
+        parts = [buf[start + len(needle):]]
+        while True:
+            end = parts[-1].find('"')
+            if end != -1:
+                parts[-1] = parts[-1][:end]
+                return ''.join(parts)
+
+            buf = fh.read(page)
+            if buf == '': # EOF before the closing doublequotes
+                return False
+            parts.append(buf)
+    finally:
+        fh.close()
+
+def grep(fpath, sn):
+    """Return the value stored under key sn in the cjson file
+       fpath, or False if it cannot be found. Uses mmap when
+       the module is available, plain read()s otherwise.
+       """
+    try:
+        import mmap
+    except ImportError:
+        # no mmap module here (e.g. a trimmed-down initrd)
+        return grep_for_lease_read(fpath, sn)
+
+    return grep_for_lease_mmap(fpath, sn)
+
+
+## sample test - work through a cjson file
+## based on the 'words' dict file, in reverse.
+## each word is key and value, with the value
+## having its capitalisation reversed.
+# import sys
+# fh = open(sys.argv[1])
+# bigdata = {}
+# lines = fh.readlines()
+# lines.reverse()
+# for k in lines:
+#     k = k.strip()
+#     print "Looking for %s" % k
+#     found = grep(sys.argv[2], k)
+#     if found:
+#         if found == k.swapcase():
+#             print "... found good match"
+#         else:
+#             print "BAD MATCH %s" % found
+#     else:
+#         print "NO MATCH"
+
+## Another sample test - args: filename, SN
+#import sys
+#found = grep(sys.argv[1], sys.argv[2])
+#
+#if found:
+#    print "Found:" + found
+#else:
+#    print 'not found'
+
-- 
1.6.0.6