summary refs log tree commit diff
path: root/parse_datasheet/datasheet_tables.py
diff options
context:
space:
mode:
Diffstat (limited to 'parse_datasheet/datasheet_tables.py')
-rw-r--r--  parse_datasheet/datasheet_tables.py  1035
1 files changed, 1035 insertions, 0 deletions
diff --git a/parse_datasheet/datasheet_tables.py b/parse_datasheet/datasheet_tables.py
new file mode 100644
index 0000000..778783a
--- /dev/null
+++ b/parse_datasheet/datasheet_tables.py
@@ -0,0 +1,1035 @@
+#!/usr/bin/python
+
+import xml.etree.cElementTree as ET
+from operator import attrgetter, itemgetter
+import re
+from collections import namedtuple
+from copy import copy
+
+DBG = 0
+
def filter_pages(doc, min_page, max_page):
    """
    Return the <page> elements of the parsed XML document whose 'number'
    attribute lies in [min_page, max_page] (both inclusive).
    """
    # Element.getchildren() is deprecated (and removed in Python 3.9);
    # iterating the element yields the same direct children.
    return [page
            for page in doc.getroot()
            if min_page <= int(page.attrib['number']) <= max_page]
+
+
def load_datasheet_pages(filepath, min_page, max_page):
    """Parse the pdftohtml XML file and return its pages in [min_page, max_page]."""
    tree = ET.parse(filepath)
    return filter_pages(tree, min_page, max_page)
+
+
# Unicode punctuation -> ASCII substitutions applied before the lossy
# 'replace' encoding in asciify() (curly quotes, apostrophes, en-dash).
ASCIIFY_TABLE = {
    u'\u201c': '"',
    u'\u201d': '"',
    u'\u2019': "'",
    u'\u2018': "'",
    u'\u2013': "-",
}
+
def asciify(us):
    """
    Force a unicode string down to ASCII: known punctuation is mapped via
    ASCIIFY_TABLE, anything else non-ASCII becomes '?'.
    """
    return ''.join(ASCIIFY_TABLE.get(ch, ch).encode('ascii', 'replace')
                   for ch in us)
+
+
class State(object):
    """Lightweight named sentinel used as a parser state / return marker."""
    __slots__ = ['name']
    def __init__(self, name):
        self.name = name
    def __repr__(self):
        return '<%s>' % (self.name,)
+
+
+def _get_all_text_rec(elem, parts):
+ if elem.text:
+ parts.append(elem.text)
+ for c in elem.getchildren():
+ _get_all_text_rec(c, parts)
+ if c.tail:
+ parts.append(c.tail)
+ return parts
+
def get_all_text(elem):
    """Concatenate all text below elem (text and tails) into one unicode string."""
    parts = _get_all_text_rec(elem, [])
    return u''.join(parts)
+
+
# Hashable identity of a text element; used by profiles to pinpoint one
# exact TE on a page (see ProfileBase.special_te_mapper).
TE_Id = namedtuple('TE_Id', 'text top left width height')
+
class SomeText(object):
    """One <text> element of the pdftohtml XML output, plus parser bookkeeping."""
    __slots__ = ['text', 'top', 'left', 'width', 'height', 'handled', 'special']

    def __init__(self, text, top, left, width, height, special):
        self.text = text
        self.top = top
        self.left = left
        self.width = width
        self.height = height
        self.special = special
        # flipped to True once some dispatch rule consumed this element
        self.handled = False

    @classmethod
    def from_et(cls, elem):
        """Build from an ElementTree <text> node; geometry attributes as ints."""
        attrs = elem.attrib
        return cls(get_all_text(elem),
                   int(attrs['top']), int(attrs['left']),
                   int(attrs['width']), int(attrs['height']),
                   special=False)

    @classmethod
    def special_from_id(cls, te_id):
        """Build a profile-provided replacement element, marked special."""
        return cls(te_id.text, te_id.top, te_id.left,
                   te_id.width, te_id.height, special=True)

    def identity(self):
        """Hashable identity, stable across parses of the same page."""
        return TE_Id(self.text, self.top, self.left, self.width, self.height)

    def __repr__(self):
        tag = " special" if self.special else ""
        return '<SomeText top=%r left=%r width=%r height=%r text=%r%s>' % (
            self.top, self.left, self.width, self.height, self.text, tag)
+
+
def dbg_fmt_hor_pos(hp):
    """Debug helper: (name, [left, right]) pairs sorted left to right."""
    return sorted(hp.items(), key=lambda kv: kv[1])
+
# Column indices of a register table, in left to right order:
# WARNING: this should not ever need to change, but if it does after all
# you should carefully review this whole file...
COL_RANGE = 0
COL_ACRONYM = 1
COL_DESCRIPTION = 2
COL_STICKY = 3
COL_RESET = 4
COL_ACCESS = 5
COL_NUMBER = 6

# Column Descriptor: nominal width (in page units) and left-to-right index.
CD = namedtuple('CD', 'width idx')
+
class ColumnsPos(object):
    """
    Tracks the horizontal extent of the six register-table columns.
    Column-header text elements refine self.hor_pos (a name -> [left, right]
    map); table content is later matched against these extents via search().
    """

    # Nominal column widths (page units) and their left-to-right indices.
    Columns = {
        'bit range': CD(60, COL_RANGE),
        'bit acronym': CD(58, COL_ACRONYM),
        'bit description': CD(204, COL_DESCRIPTION),
        'sticky': CD(28, COL_STICKY),
        'bit reset': CD(58, COL_RESET),
        'bit access': CD(51, COL_ACCESS),
    }

    # horizontal tolerance (page units) when matching neighbouring columns
    ADJUST = 10

    def __init__(self):
        # column name -> [left, right], or None until its header is seen
        self.hor_pos = dict([(k, None) for k in self.Columns.iterkeys()])
        # vertical position of the header row (set on 'bit range')
        self.ver_pos = None
        # DBG:
        self.raw_hor_pos = dict(self.hor_pos)

    def overlap(self, txt, left, right):
        # Name of an already-recorded column whose extent intersects
        # [left, right), or None.
        for k, a in self.hor_pos.iteritems():
            if a and (left < a[1]) and not (right <= a[0]):
                return k

    def find_k_left(self, left):
        # Column whose right edge lies within ADJUST of this left edge.
        for k, a in self.hor_pos.iteritems():
            if a and (left - self.ADJUST <= a[1] <= left + self.ADJUST):
                return k

    def find_k_right(self, right):
        # Column whose left edge lies within ADJUST of this right edge.
        for k, a in self.hor_pos.iteritems():
            if a and (right - self.ADJUST <= a[0] <= right + self.ADJUST):
                return k

    def adjust(self, pos_a, cd_a, pos_b, cd_b):
        # Weighted average of two near-coincident column edges.
        w = cd_a.width + cd_b.width
        # the longest can be adjusted more:
        return ((pos_a * cd_b.width) # move less if the other is longer
                + (pos_b * cd_a.width)
                + w/2) / w

    def try_column_header(self, te, dbg=""):
        """
        If te is a column header ('bit range', ..., with 'value' accepted
        as an alias for 'bit reset'), record or refine its horizontal
        extent and return True; return False for non-header text.
        Raises ValueError when the new extent overlaps another column.
        """
        txt = te.text.strip().lower()
        if txt == 'bit range':
            self.ver_pos = te.top
        cd = self.Columns.get(txt)
        if (not cd) and txt == 'value':
            cd = self.Columns['bit reset']
        if not cd:
            return False

        # center the nominal width on the header text element
        left = te.left + te.width / 2 - cd.width / 2
        right = left + cd.width

        if txt == 'value':
            # 'value' must sit where 'bit reset' was already recorded
            o_l, o_r = self.hor_pos['bit reset']
            if not (o_l - self.ADJUST <= left <= o_l + self.ADJUST) \
               or not (o_r - self.ADJUST <= right <= o_r + self.ADJUST):
                print "WARNING: %r:" % dbg, \
                    "'value' (%d, %d) not haligned with 'bit reset'" \
                    % (left, right), \
                    "### %r" % vars(self)
            return True

        # DBG:
        self.raw_hor_pos[txt] = [left, right]

        # glue this column's edges to already-seen neighbours
        k_left = self.find_k_left(left)
        if k_left:
            near_left = self.hor_pos[k_left][1]
            cd_left = self.Columns[k_left]
            if cd_left.idx + 1 != cd.idx:
                print "WARNING: %r:" % dbg, \
                    "column left of '%s' should not be '%s'" % (txt, k_left)
            self.hor_pos[k_left][1] = left = self.adjust(left, cd,
                                                         near_left, cd_left)
        k_right = self.find_k_right(right)
        if k_right:
            near_right = self.hor_pos[k_right][0]
            cd_right = self.Columns[k_right]
            if cd_right.idx - 1 != cd.idx:
                print "WARNING: %r:" % dbg, \
                    "column right of '%s' should not be '%s'" % (txt, k_right)
            self.hor_pos[k_right][0] = right = self.adjust(right, cd,
                                                           near_right, cd_right)

        other_k = self.overlap(txt, left, right)
        if other_k:
            raise ValueError("column %r (%d, %d) overlaps with %r (%d, %d)"
                             % (txt, left, right, other_k,
                                self.hor_pos[other_k][0],
                                self.hor_pos[other_k][1]))

        self.hor_pos[txt] = [left, right]

        return True

    @staticmethod
    def matching_level(l, r, l_ref, r_ref):
        # Fraction of [l, r) covered by [l_ref, r_ref); 0.0 when disjoint.
        l_com = max(l, l_ref)
        r_com = min(r, r_ref)
        if l_com >= r_com:
            return 0.0
        else:
            return (r_com - l_com) / float(r - l)

    def search(self, left, right, min_match_level, out_level=None):
        """
        Column index whose extent overlaps [left, right) by at least
        min_match_level (fraction of the TE width), or None.  When
        out_level is a list, the matching level is stored in out_level[0].
        """
        for k, (l_col, r_col) in self.hor_pos.iteritems():
            level = self.matching_level(left, right, l_col, r_col)
            if level >= min_match_level:
                if out_level is not None:
                    out_level[0] = level
                return self.Columns[k].idx
+
+
class Cell(object):
    """
    One table cell: text accumulated as a list of paragraphs.  Text
    elements on the same visual line extend the current paragraph,
    lower ones open a new paragraph.
    """
    ADJ_X = 5

    def __init__(self, column, col_name):
        self.paragraphs = []
        self.last_te = None    # previous inserted TE, for the line-merge test
        self.column = column
        self.col_name = col_name

    def is_empty(self):
        return not self.paragraphs

    def insert_te(self, te):
        """Append te's (asciified) text, merging it when it continues the previous line."""
        text = asciify(te.text)
        prev = self.last_te
        if prev is not None and te.top <= prev.top + prev.height:
            self.paragraphs[-1] += text
        else:
            self.paragraphs.append(text)
        self.last_te = te

    def analyse(self):
        # base cells never warn; subclasses refine this
        self.analysis_warning = False

    # post-analyse
    def simple_lines(self):
        """Printable lines: the column name, then each paragraph indented."""
        lines = [self.col_name]
        for paragraph in self.paragraphs:
            lines.append("\t" + paragraph)
        return lines
+
+
class CellSymbol(Cell):
    """Cell expected to hold one single short token (acronym, flag, ...)."""
    WARN_EMPTY = True   # subclasses may tolerate an empty cell

    def analyse(self):
        if self.is_empty():
            self.content = ''
            self.analysis_warning = self.WARN_EMPTY
        else:
            self.content = self.paragraphs[0].strip()
            # several paragraphs means this was not a single token
            self.analysis_warning = len(self.paragraphs) > 1

    def warn(self):
        if self.analysis_warning:
            return "\t\t<warning>"
        return ""

    def simple_lines(self):
        return ["%-13s %s%s" % (self.col_name, self.content, self.warn())]
+
+
class CellRange(CellSymbol):
    """'Bit Range' cell: parses "<hi>-<lo>" or a single bit number into .range."""

    def _parse(self):
        # "<hi>-<lo>" (the separator may be '-', ':' or spaces)
        m = re.match(r"(\d+)[- :]+(\d+)", self.content)
        if m:
            self.range = (int(m.group(1)), int(m.group(2)))
            return
        # single-bit field: "<n>" -> (n, n)
        try:
            bit = int(self.content)
        except ValueError:
            return
        self.range = (bit, bit)

    def analyse(self):
        CellSymbol.analyse(self)
        self.range = None
        self._parse()
        if self.range is None:
            self.analysis_warning = True

    def bits_width(self):
        """Number of bits covered by the range, or None when unparsed."""
        if self.range is None:
            return None
        hi, lo = self.range
        return hi + 1 - lo

    def str_range(self):
        if self.range is None:
            return 'None'
        hi, lo = self.range
        if hi == lo:
            return "%d" % hi
        return "%d-%d" % (hi, lo)

    def simple_lines(self):
        return ["%-13s %s%s" % (self.col_name, self.str_range(), self.warn())]
+
+
class CellConstraint(CellSymbol):
    """
    Abstract single-token cell whose content must belong to a fixed set;
    subclasses define ALLOWED, e.g.:
        ALLOWED = ('RO', 'RW')
    """
    def analyse(self):
        CellSymbol.analyse(self)
        self.analysis_warning = (self.analysis_warning
                                 or self.content not in self.ALLOWED)
+
+
class CellSticky(CellConstraint):
    # 'Sticky' column: an empty cell is acceptable here, so do not let
    # CellSymbol.analyse flag emptiness.
    WARN_EMPTY = False # for upper layers
    ALLOWED = ('Y', 'N', '')
+
+
class CellAccess(CellConstraint):
    # 'Bit Access' column: access tokens observed so far; the trailing
    # comments record which register set required each addition.
    ALLOWED = ('RW', 'RO', 'RWS', # <= enough for smrbase
               'RWC', 'RWO', 'RW0C', # + imch_conf
               'RWL', # + imch_conf
               'RV', 'RCWC', 'WO', 'RC', # + gbe
               'RO/RWC') # + gbe
+
+
def to_bin(i):
    """
    Binary string of a non-negative int, without prefix or padding
    ('0' for 0, '101' for 5).
    """
    if i == 0:
        return '0'
    digits = []
    while i:
        digits.append(str(i & 1))
        i //= 2   # explicit floor division: '/' is Python-2-only here
    digits.reverse()
    return ''.join(digits)
+
class CellReset(CellSymbol):
    """
    'Bit Reset' cell: a reset value written "...b" (binary), "...h" (hex)
    or a bare single bit '0'/'1'.  After analyse(), .value/.base hold the
    parsed number (None on failure); the upper layer supplies the field
    width through set_bits() so representations can be zero-padded.
    """
    def analyse(self):
        CellSymbol.analyse(self)
        self.value = None
        self.base = None
        self.bits = None   # field width, set later by set_bits()
        if self.content.endswith('b'):
            self.base = 2
        elif self.content.endswith('h'):
            self.base = 16
        if self.base:
            try:
                self.value = int(self.content[:-1], self.base)
            except ValueError:
                pass
        if self.content in ('0', '1'):
            self.value = int(self.content)
            self.base = 2
        if self.value is None:
            self.analysis_warning = True
    # post-analyse
    def set_bits(self, bits):
        """Record the field width (taken from the range cell)."""
        self.bits = bits
    def bin_repr(self):
        """Binary rendering, zero-padded to the field width when known."""
        if self.value is None:
            return 'None'
        bin = to_bin(self.value)
        if self.bits is None:
            return bin + 'b'
        else:
            return ('0' * (self.bits - len(bin))) + bin + 'b'
    def hex_repr(self):
        """Hex rendering, zero-padded to the nibble count when known."""
        if self.value is None:
            return 'None'
        if self.bits is None:
            return "%xh" % self.value
        else:
            # explicit floor division: '/' on ints is Python-2-only behaviour
            nibs = (self.bits + 3) // 4
            return ("%%0%dxh" % nibs) % self.value
    def simple_repr(self):
        """Both renderings; brackets mark the datasheet's own base."""
        if self.value is None:
            return 'None'
        bh = (self.bin_repr(), self.hex_repr())
        if self.base == 2:
            return "[ %s ]\t %s " % bh
        else:
            return " %s \t[ %s ]" % bh
    def simple_lines(self):
        return ["%-11s %s%s" % (self.col_name, self.simple_repr(), self.warn())]
+
+
# Dispatch return codes; the producer of each code is noted per line
# (Line = Line.try_insert_te, PageTE = the PageTE TE dispatch).
                                     # Line PageTE
INSERTED = State('INSERTED') # X X
NEW_LINE = State('NEW_LINE') # X
INVALID_LINE = State('INVALID_LINE') # X
NOT_HANDLED = State('NOT_HANDLED') # X
END_OF_PAGE = State('END_OF_PAGE') # X

# Matches iff the text STARTS with a character valid in a bit range
# (digit, '-', ':' or space); used to reject bogus range cells.
VALID_TE_RANGE = re.compile(r'[-0-9 :]').match
+
class Line(object):
    """
    One register-description row: six typed cells plus an estimated
    vertical span [top, bottom] used to decide whether a text element
    belongs to this row or opens the next one.
    """
    def __init__(self, top):
        # direct symbolic cell access
        # prefixed by c_ cause i might also use unprefixed attrs in the
        # future and directly put payloads there.
        self.c_range = CellRange (COL_RANGE, 'range')
        self.c_acronym = CellSymbol (COL_ACRONYM, 'acronym')
        self.c_description = Cell (COL_DESCRIPTION, 'description')
        self.c_sticky = CellSticky (COL_STICKY, 'sticky')
        self.c_reset = CellReset (COL_RESET, 'reset')
        self.c_access = CellAccess (COL_ACCESS, 'access')
        # access by columns index:
        self.cells = [
            self.c_range,
            self.c_acronym,
            self.c_description,
            self.c_sticky,
            self.c_reset,
            self.c_access,
        ]
        self.top = top
        self.bottom = None  # estimated once the first range TE is inserted
        self.bits = None    # field width, filled in by analyse_line()
    def is_empty(self):
        for c in self.cells:
            if not c.is_empty():
                return False
        return True
    def try_insert_te(self, column, te):
        """
        Try to add te to the cell at column index `column`.  Returns
        INSERTED on success, NEW_LINE when te lies below this row (the
        caller must open a new Line), or INVALID_LINE when a range cell
        receives non-range text (the whole row must be discarded).
        """
        if (self.bottom is None) or (te.top < self.bottom):
            # NOTE: this only works if the range cell contains only one line
            if (self.bottom is None) and (column == COL_RANGE):
                if not VALID_TE_RANGE(te.text):
                    return INVALID_LINE
                # first estimate: the row is vertically centered on its
                # range TE, so it extends below by (te.top - self.top)
                self.bottom = 2 * te.top - self.top
            if (self.bottom is not None) \
               and (te.top + te.height - 1 > self.bottom):
                # grow the row so it contains every inserted TE
                self.bottom = te.top + te.height - 1
            self.cells[column].insert_te(te)
            return INSERTED
        else:
            return NEW_LINE
    def simple_print(self):
        for c in self.cells:
            for l in c.simple_lines():
                print l
    def value_print(self, value):
        """Print the row plus a synthetic 'value' cell sliced out of value."""
        cell_value = copy(self.c_reset)
        bit_base = self.c_range.range[1]
        bit_mask = (1 << cell_value.bits) - 1
        cell_value.value = (value >> bit_base) & bit_mask
        cell_value.col_name = "value"
        for c in self.cells[:2] + [cell_value] + self.cells[2:]:
            for l in c.simple_lines():
                print l
    def analyse_line(self):
        """Analyse every cell, then propagate the bit width to the reset cell."""
        for c in self.cells:
            c.analyse()
        self.bits = self.c_range.bits_width()
        self.c_reset.set_bits(self.bits)
+
+
def strip_strict_ascii(s):
    """Strip surrounding whitespace and ASCII-encode, rejecting non-ASCII text."""
    stripped = s.strip()
    return stripped.encode('ascii', 'strict')
+
def parse_offset(s):
    """
    Parse a trailing "XXh" or "XXh at YYh" hex offset specification.
    Returns (base, increment) with increment None when absent; raises
    ValueError when nothing matches.
    """
    m = re.search(r"([0-9a-f]+)h(?:\s+at\s+([0-9a-f]+)h)? *$", s, re.I)
    if not m:
        raise ValueError("can't parse offset %r" % s)
    base_s, inc_s = m.groups()
    inc = int(inc_s, 16) if inc_s else inc_s
    return int(base_s, 16), inc
+
def format_offset(off):
    """Human-readable offset: '-' when unknown, raw text as-is, ints as hex."""
    if off is None:
        return "-"
    if isinstance(off, basestring):
        return off
    return "%02Xh" % off
+
def parse_size(s):
    """Parse a register size such as "32 bit(s)" into its bit count, or raise ValueError."""
    m = re.match(r"(\d+)\s+bit", s, re.I)
    if not m:
        raise ValueError("can't parse size %r" % s)
    return int(m.group(1))
+
def parse_default(s):
    """
    Parse a 'Default:' field: "XXh" becomes an int; anything not
    hex-looking (including "don't care" values containing 'X') is
    returned verbatim.
    """
    s = s.strip()
    if not s.endswith('h'):
        # raise ValueError("can't parse default value %r" % s)
        return s
    try:
        return int(s[:-1], 16)
    except ValueError:
        if "X" in s: # gruik hack
            return s
        raise
+
def fmt_identity(t, x):
    # Table.HeaderAttr formatter hook: return the attribute value unchanged
    # (t is the Table instance, unused here).
    return x
+
def fmt_hex(t, x):
    """Table.HeaderAttr formatter hook: lowercase hex with an 'h' suffix."""
    return format(x, 'x') + 'h'
+
def fmt_offset_start_end(t, x):
    """Format an (offset, increment) pair; the increment is omitted when None."""
    base, inc = x
    if inc is None:
        return "%xh" % base
    return "%xh at %xh" % (base, inc)
+
def fmt_default(t, x):
    """
    Format a default value: unparsed (string) defaults pass through,
    ints are zero-padded to the table's size in nibbles.
    """
    if isinstance(x, basestring):
        return x
    # explicit floor division: '/' on ints is Python-2-only behaviour
    nibs = (t.size + 3) // 4
    return ("%%0%dxh" % nibs) % x
+
+
# Field Descriptor for header key/value pairs (see Table.Fields):
#   attr_name - Table attribute receiving the parsed value
#   adj_y/adj_x - spatial search tolerance for the value TE right of the label
#   parser - callable turning the raw text into the stored value
FD = namedtuple('FD', 'attr_name adj_y adj_x parser')
+
class Table(object):
    """
    One register-description table, built incrementally during page parse:
    title metadata, the key/value header fields, the column geometry
    (ColumnsPos) and finally the content rows (Line instances).
    """

    # WARNING:
    # offset_end often contains random information in recurring registers

    # Header fields: label text -> FD(attr_name, adj_y, adj_x, parser)
    Fields = { 'Description:': FD('description', 2, 5, strip_strict_ascii),
               'View:': FD('view', 2, 5, strip_strict_ascii),
               'BAR:': FD('bar', 2, 5, strip_strict_ascii),
               'Offset Start:': FD('offset_start', 2, 5, parse_offset),
               'Offset End:': FD('offset_end', 2, 5, parse_offset),
               'Power Well:': FD('power_well', 2, 5, strip_strict_ascii),
               'Size:': FD('size', 2, 5, parse_size),
               'Default:': FD('default', 2, 5, parse_default),
               'Bus:Device:Function:': FD('bus_device_function', 2, 5, strip_strict_ascii) }

    # (attribute name, formatter) pairs in display order; used by
    # value_print(), fmt() and header_mismatch().
    HeaderAttr = [
        # from table title:
        ('table_ref', fmt_identity),
        ('offset', fmt_hex),
        ('reg_name', fmt_identity),
        ('recurring', fmt_identity),
        ('reg_base_name', fmt_identity),
        ('title_desc', fmt_identity),

        # from table header:
        ('description', fmt_identity),
        ('view', fmt_identity),
        ('bar', fmt_identity),
        ('offset_start', fmt_offset_start_end),
        ('offset_end', fmt_offset_start_end),
        ('power_well', fmt_identity),
        ('size', fmt_identity),
        ('default', fmt_default),
        ('bus_device_function', fmt_identity),
    ]
    HeaderAttrDict = dict(HeaderAttr)

    START_LINE_MAX_HEIGHT = 11 # max interval _between_ title lines
    # approximate height of the column-header row, used to place the first
    # content row below it
    COL_HEADER_HEIGHT = 26
    def __init__(self, chapter, table_num, offset, reg_name, title_desc):
        self.chapter = chapter
        self.table_num = table_num
        self.offset = offset
        self.reg_name = reg_name
        self.title_desc = title_desc
        self.table_ref = "%d-%d" % (self.chapter, self.table_num)
        #
        self.recurring, self.reg_base_name = self.parse_reg_name(reg_name)
        #
        # header fields, filled in by set_field_parse() as labels are found
        self.description = None
        self.view = None
        self.bar = None
        self.offset_start = None
        self.offset_end = None
        self.power_well = None
        self.size = None
        self.default = None
        self.bus_device_function = None
        #
        self.columns = None
        self.cur_line = None
        self.lines = []
    @staticmethod
    def parse_reg_name(reg_name):
        """
        Detect a recurring register name "NAME[<a>-<b>]".  Returns
        (instance_count, base_name), or (None, reg_name) for a plain
        (non-recurring) register.
        """
        m = re.match(r"(.*?)[[](\d+)[-:](\d+)[]]", reg_name)
        if not m:
            return None, reg_name
        reg_base_name, left, right = m.groups()
        left = int(left)
        right = int(right)
        low = min(left, right)
        high = max(left, right)
        if low != 0:
            print ("WARNING: don't know how to handle recurring register %s "
                   "not starting at index 0. Loading as non recurring for now."
                   % reg_name)
            return None, reg_base_name
        return high + 1, reg_base_name
    def attrs_set_abstract(self):
        # drop parse-time-only state, leaving the "abstract" table
        del self.columns
        del self.cur_line
    def clone_abstract(self):
        """
        Clone the abstract part of this table, that is everything needed for
        extraction of payload informations.
        result.columns and result.cur_line are deleted.
        self.lines is shallow copied. => result.lines = self.lines[:]
        """
        result = copy(self)
        result.attrs_set_abstract()
        result.lines = self.lines[:]
        return result
    def extend_abstract(self, other):
        """
        Once a first 'concrete' table has been cloned to an abstract version,
        it can be extended with other parts of the same table (from subsequent
        pages). Example if a table is splitted between (end of) page p_1 and
        (start of page) p_2:
        >>> pte_1 = PageTE(p_1, profile)
        >>> pte_2 = PageTE(p_2, profile)
        >>> full_table = pte_1.tables[-1].clone_abstract()
        >>> full_table.extend_abstract(pte_2.tables[0])
        """
        self.lines.extend(other.lines)
    def header_mismatch(self, other):
        """List of (attr, self_value, other_value) for differing header attrs."""
        return [
            (attr, getattr(self, attr), getattr(other, attr))
            for (attr, _) in self.HeaderAttr
            if getattr(self, attr) != getattr(other, attr)
        ]
    def use_columns_pos(self):
        # attach fresh column-geometry tracking for the parse stage
        self.columns = ColumnsPos()
    def cat_start_line(self, text):
        # titles may wrap over several lines: concatenate the continuation
        self.title_desc += text.encode('ascii', 'strict')
    def set_field_parse(self, fd, s):
        """Parse s with fd.parser and store it on the attribute fd.attr_name."""
        try:
            setattr(self, fd.attr_name, fd.parser(s))
        except:
            print "exception while parsing %r" % vars(self)
            raise
    def close(self):
        # strip the "(Sheet n of m)" continuation markers from the title
        self.title_desc = re.sub(r'\s*\(Sheet .* of .*\)\s*', ' ',
                                 self.title_desc).strip()
        self.cur_line = None
    def start_content(self):
        # open the first row just below the column-header row
        self.cur_line = Line(self.columns.ver_pos + self.COL_HEADER_HEIGHT / 2)
        self.lines.append(self.cur_line)
    def insert_content_te(self, column, te):
        """
        Route te into the current row, opening a new row when te lies
        below it.  Returns False when te invalidates the row (bogus range
        text): the row is dropped and the caller should close the table.
        """
        res = self.cur_line.try_insert_te(column, te)
        if res == INSERTED:
            return True
        elif res == NEW_LINE:
            if self.cur_line.bottom is None:
                # previous row never got a range TE: anchor the new row
                # just above te
                next_top = te.top + 1 - te.height
            else:
                next_top = self.cur_line.bottom
            self.cur_line = Line(next_top)
            self.lines.append(self.cur_line)
            self.cur_line.try_insert_te(column, te)
            return True
        else: # INVALID_LINE
            self.lines = self.lines[:-1]
            if self.lines:
                self.cur_line = self.lines[-1]
            else:
                self.cur_line = None
            return False
    def analyse_content(self):
        for l in self.lines:
            l.analyse_line()
    def check_ranges(self):
        """True iff the rows' bit ranges tile [size-1 .. 0] contiguously."""
        rembits = self.size
        for l in self.lines:
            if (l.c_range.range is None) or (rembits != l.c_range.range[0] + 1):
                return False
            rembits = l.c_range.range[1]
        return rembits == 0
    def auto_offset(self):
        # default the title offset from the 'Offset Start' header field
        if self.offset is None:
            self.offset = self.offset_start[0]
    def fmt(self, field_name):
        """Format one header attribute using its HeaderAttr formatter."""
        fmt_func = self.HeaderAttrDict[field_name]
        return fmt_func(self, getattr(self, field_name))
    def value_print(self, value):
        """Pretty-print the whole table with value decomposed per bit field."""
        print '============================================================'
        print '%xh' % self.offset, self.reg_name
        for h, fmt in self.HeaderAttr:
            print " %-15s %s" % (h, fmt(self, getattr(self, h)))
        print
        for l in self.lines:
            print " ----------------"
            l.value_print(value)
        print
        print
+
+
class ProfileBase(object):
    """
    Base class for register-set parsing profiles.

    Parsing register tables out of the informally written 320066 datasheet
    usually requires small adjustments while text elements are dispatched
    and/or after the parse stage on the abstracted tables; some tables also
    need to be blacklisted and the page range must be chosen.

    Derive this class to implement such a register set profile,
    then register it by calling register_profile(derived_class).
    """

    name = None # string - symbolic name of the register set
    min_page = None # int - first page to parse (included)
    max_page = None # int - last page to parse (included)

    @staticmethod
    def special_te_mapper(ps, te, table):
        """
        This method can be used to help implement the .special_replace() method.
        ps: Parse State - instance of PageTE.ParseState
        te: Text Element - instance of SomeText
        table: replacement (hash) table, ex:
            {
            (432, TE_Id(text=u'N 00001b', top=384, left=409, width=60, height=8)):
                [TE_Id(text=u'N', top=384, left=409, width=4, height=8),
                 TE_Id(text=u'00001b', top=384, left=427, width=50, height=8)],
            }
        The key is (page_num, te_id) where te_id describes the original text
        element which could not be fitted in a table cell.
        The value is a list of TE_Id, to replace the offending one.
        Returns the replacement SomeText list, or None when te is unmapped.
        """
        replacement = table.get((ps.pte.page_num, te.identity()))
        if replacement is None:
            return None
        return [SomeText.special_from_id(te_id) for te_id in replacement]

    def special_replace(self, ps, te):
        """Hook: return replacement TEs for te, or None to leave it alone."""
        return None

    def post_fix(self, a_t):
        """Hook: adjust the AbsTables instance after the parse stage."""
        pass

    def table_blacklisted(self, chapter, table_num):
        """
        Hook: return True to skip Table <chapter>-<table_num> entirely.
        chapter: int
        table_num: int
        """
        return False
+
+
# symbolic profile name -> profile class
_profiles = {}

def register_profile(profile_class):
    """Register a ProfileBase subclass under its .name for profile_factory()."""
    if not isinstance(profile_class.name, basestring):
        # message fixed: it is the class' .name attribute that is being
        # validated here, not the class itself
        raise ValueError("profile_class.name should be a string")
    _profiles[profile_class.name] = profile_class
+
+
def profile_factory(profile_name):
    """Instantiate the registered profile named profile_name, or raise ValueError."""
    try:
        profile_class = _profiles[profile_name]
    except KeyError:
        raise ValueError("profile %s does not exist" % profile_name)
    return profile_class()
+
+
# Unhandled Description: a text element no dispatch rule consumed, with
# the table it was encountered in, for the end-of-page report.
UD = namedtuple('UD', 'te table_ref reg_name')

# Parsed pieces of a "Table <c>-<n>. [Offset XXh:] NAME - desc" title line.
ParsedTitle = namedtuple('ParsedTitle',
                         'chapter table_num offset reg_name title_desc')
+
class PageTE(object):
    """
    Parses one XML <page> into a list of Table objects (self.tables),
    dispatching each text element through a small state machine:
    NOP -> TITLE -> HEADER -> COL_HEADER -> CONTENT.
    """

    class ParseState(object):
        """
        This contains attributes that are shared between PageTE methods
        during the parse stage, but really conceptually are local vars.
        I put them all here because it's easier to get ride of them all
        like that, and also less error prone. Plus they are clearly
        identified, so user code won't try to do stupid things and hopefully
        the mess in PageTE will be kept limited.
        """
        NOP = State('NOP')
        TITLE = State('TITLE')
        HEADER = State('HEADER')
        COL_HEADER = State('COL_HEADER')
        CONTENT = State('CONTENT')
        def __init__(self, pte, et_page, profile):
            # tels: text elements
            # curt: current table
            # st: state (one of the above)
            #
            # unconsumed TEs recorded per state for the end-of-page report
            self.unhandled = {# no NOP
                              self.TITLE: [],
                              self.HEADER: [],
                              self.COL_HEADER: [],
                              self.CONTENT: []}
            self.pte = pte
            self.tels = [SomeText.from_et(elem)
                         for elem in et_page.findall('text')]
            self.tels.sort(key=lambda elem: (elem.top, elem.left))
            # spatial indexes: TEs bucketed by left (X) and top (Y)
            self.tels_X = [[] for i in xrange(self.pte.page_width)]
            self.tels_Y = [[] for i in xrange(self.pte.page_height)]
            for te in self.tels:
                self.tels_X[te.left].append(te)
                self.tels_Y[te.top].append(te)
            self.curt = None
            self.st = self.NOP
            self.title_y = None
            self.profile = profile
        def close_curt(self):
            # finalize the current table (if any) and fall back to NOP
            if self.curt:
                self.curt.close()
                self.curt = None
            self.st = self.NOP
        def find_te_around(self, y, x, ay, ax):
            # First TE within +/-ay of y and +/-ax of x (scanning rows top
            # to bottom), or None; used to locate header field values.
            y_min = max(0, y - ay)
            y_max = min(self.pte.page_height, y + ay + 1)
            x_min = max(0, x - ax)
            x_max = min(self.pte.page_width, x + ax + 1)
            for tel_line in self.tels_Y[y_min:y_max]:
                for te in tel_line:
                    if x_min <= te.left < x_max:
                        return te

    # warn about header field labels with no value TE next to them
    WARN_EMPTY = False

    # minimum overlap fraction for a TE to be assigned to a column
    MIN_MATCH_COL = 0.9

    # any text under this point marks and end of page
    PAGE_BOTTOM = 732 # gruik hack
    TABLE_CHAR_MAX_SIZE = 10 # quite gruik too

    # matches a table title; groups: chapter, table_num, optional offset,
    # register name (possibly with a recurring "[a-b]" suffix), description
    start_search = re.compile(ur"""
        Table\s+(\d+)-(\d+)\s*\.\s* # <chapter>-<table_num>
        (?:Offset\s+([0-9a-f]+)h\s*:\s*)? # <offset>
        (\S+?(?:[[]\S+?[]])?)(?:\s?[-\u2013:])+ # <reg_name>
        (.*)$ # <title_desc>
        """, re.I | re.VERBOSE).search

    def parse_table_title(self, ps, text):
        """
        Match a table title line.  Returns a ParsedTitle, or None when
        text is not a title or the table is blacklisted by the profile.
        """
        m = self.start_search(text)
        if not m:
            return None
        chapter = int(m.group(1))
        table_num = int(m.group(2))
        offset_group = m.group(3)
        offset = (offset_group and int(offset_group, 16))
        if ps.profile.table_blacklisted(chapter, table_num):
            return None
        return ParsedTitle(
            chapter=chapter,
            table_num=table_num,
            offset=offset,
            reg_name=m.group(4).encode('ascii', 'strict'),
            title_desc=m.group(5).encode('ascii', 'strict'))

    @staticmethod
    def PteTable(parsed_title):
        # Build a Table from a parsed title, with fresh column tracking.
        pt = Table(parsed_title.chapter,
                   parsed_title.table_num,
                   parsed_title.offset,
                   parsed_title.reg_name,
                   parsed_title.title_desc)
        pt.use_columns_pos()
        return pt

    # note: TEs are usually sorted by (y, x), but not strictly
    # in case of a special case. Don't do insane things.
    def _process_te(self, ps, te):
        # Dispatch one text element according to the current state.
        # Returns INSERTED, NOT_HANDLED or END_OF_PAGE.
        if te.top >= self.PAGE_BOTTOM:
            return END_OF_PAGE
        # a table title may show up in any state and opens a new table
        parsed_title = self.parse_table_title(ps, te.text)
        if parsed_title:
            ps.close_curt()
            ps.curt = self.PteTable(parsed_title)
            self.tables.append(ps.curt)
            ps.title_y = te.top
            ps.st = ps.TITLE
            return INSERTED
        # header key/value field: the value TE is found spatially, right
        # of the label, and consumed immediately
        if (ps.st in (ps.TITLE, ps.HEADER)) and (te.text in Table.Fields):
            ps.st = ps.HEADER
            fd = Table.Fields[te.text]
            ote = ps.find_te_around(te.top,
                                    te.left + te.width,
                                    fd.adj_y, fd.adj_x)
            if ote:
                ote.handled = True
                ps.curt.set_field_parse(fd, ote.text)
            else:
                if self.WARN_EMPTY:
                    print ("WARNING: page %d: empty field %r in "
                           "Table %s Offset %s: %s - %s") % \
                        (self.page_num, te.text,
                         ps.curt.table_ref, format_offset(ps.curt.offset),
                         ps.curt.reg_name, ps.curt.title_desc)
            return INSERTED
        if ps.st == ps.TITLE:
            # titles may wrap over close-by lines: concatenate them
            if te.top < ps.title_y + ps.curt.START_LINE_MAX_HEIGHT:
                ps.curt.cat_start_line(te.text)
                ps.title_y = te.top
                return INSERTED
            # else nothing
        if ps.st in (ps.HEADER, ps.COL_HEADER):
            r = ps.curt.columns.try_column_header(te,
                                                  # info for error traces:
                                                  {'page': self.page_num,
                                                   'table': ps.curt.table_ref})
            if r:
                ps.st = ps.COL_HEADER
                return INSERTED
            elif ps.st == ps.COL_HEADER:
                # first non-header TE after the column headers: content
                # rows start here; fall through to the CONTENT handling
                ps.curt.start_content()
                ps.st = ps.CONTENT
                if DBG:
                    print "RAW", ps.curt.table_ref, ps.curt.reg_name, ps.curt.columns.ver_pos, dbg_fmt_hor_pos(ps.curt.columns.raw_hor_pos)
                    print "POS", ps.curt.table_ref, ps.curt.reg_name, ps.curt.columns.ver_pos, dbg_fmt_hor_pos(ps.curt.columns.hor_pos)
        if ps.st == ps.CONTENT:
            if te.height > self.TABLE_CHAR_MAX_SIZE:
                # oversized text cannot belong to the table: close it
                ps.close_curt()
                return INSERTED
            lvl = [None]
            column = ps.curt.columns.search(te.left, te.left + te.width,
                                            self.MIN_MATCH_COL, lvl)
            if DBG:
                print "III", ps.curt.table_ref, ps.curt.reg_name, lvl[0], te
            if column is not None:
                # XXX log dismissed one that do not close the table?
                # need to add a special return value to insert_content_te
                # to do that
                if not ps.curt.insert_content_te(column, te):
                    ps.close_curt()
                return INSERTED
            elif not te.special:
                # no matching column: let the profile split/replace this
                # TE and retry with the replacements (marked special, so
                # recursion terminates)
                spe_tels = self.try_special(ps, te)
                if spe_tels is not None:
                    if not spe_tels:
                        return INSERTED
                    result = NOT_HANDLED
                    for spe_te in spe_tels:
                        spe_r = self.process_te(ps, spe_te)
                        if spe_r == END_OF_PAGE:
                            return END_OF_PAGE
                        elif spe_r == INSERTED:
                            result = INSERTED
                    if result == INSERTED:
                        return INSERTED
        if ps.st in ps.unhandled:
            ps.unhandled[ps.st].append(UD(te, ps.curt.table_ref,
                                          ps.curt.reg_name))
        return NOT_HANDLED

    def process_te(self, ps, te):
        """
        factors out handling of te.handled from _process_te
        """
        if te.handled:
            return INSERTED
        r = self._process_te(ps, te)
        if r != NOT_HANDLED:
            te.handled = True
        return r

    def try_special(self, ps, te):
        """
        Returns None if this te is not to be handled as a special case.
        Else returns a list of te for which insertion should be tried.
        IMPORTANT:
        It is important that each of these te have its 'special' attribute
        set to True so that infinite recursion is reliably avoided.
        (special te won't result in a call to try_special if they are
        not handled)
        """
        return ps.profile.special_replace(ps, te)

    def __init__(self, et_page, profile):
        self.page_num = int(et_page.attrib['number'])
        if DBG:
            print "PAGE", self.page_num

        self.page_height = int(et_page.attrib['height'])
        self.page_width = int(et_page.attrib['width'])

        self.tables = []

        ps = self.ParseState(self, et_page, profile)

        # dispatch every TE in (top, left) order until the page bottom
        for te in ps.tels:
            if self.process_te(ps, te) == END_OF_PAGE:
                break
        ps.close_curt()

        # report TEs that were never consumed by any rule
        for state, unh_tels in ps.unhandled.iteritems():
            for ud in unh_tels:
                if not ud.te.handled:
                    print "WWW unhandled %r ### %r" % \
                        ((self.page_num, ud.te.identity()),
                         (ud.table_ref, ud.reg_name, state))
+
+
class AbsTables(object):
    """
    Collection of abstract (page-independent) tables.  A table seen again
    on a later page (multi-sheet table) is merged into the existing
    abstract table; lookup is by table reference or register base name.
    """
    def __init__(self):
        self.tables = []
        # "chapter-num" table reference -> abstract Table
        self.by_ref = {}
        # register base name -> abstract Table
        self.by_base_name = {}
    def append_table(self, t):
        """
        Absorb a concrete table: first sighting clones it to an abstract
        table, later sightings (same table_ref) extend it with more rows.
        Raises ValueError when a different table claims the same register.
        """
        if t.table_ref not in self.by_ref:
            if t.reg_base_name in self.by_base_name:
                raise ValueError("already have a table (ref: %s) "
                                 "for register %s" %
                                 (self.by_base_name[t.reg_base_name].table_ref,
                                  t.reg_base_name))
            new_abs_t = t.clone_abstract()
            self.tables.append(new_abs_t)
            self.by_ref[t.table_ref] = new_abs_t
            self.by_base_name[t.reg_base_name] = new_abs_t
        else:
            # continuation sheet: the header fields should agree
            ml = self.by_ref[t.table_ref].header_mismatch(t)
            if ml:
                print "MISMATCH %s:" % t.table_ref, ml
            self.by_ref[t.table_ref].extend_abstract(t)
    def find_by_base_name(self, base_name):
        return self.by_base_name.get(base_name)
    def find_by_offset(self, offset):
        """
        Table covering offset, or None.  For recurring registers any
        instance offset (base + k * increment, k < count) matches.
        """
        for t in self.tables:
            if offset == t.offset:
                return t
            if t.recurring and (offset > t.offset):
                delta = offset - t.offset
                # NOTE(review): relies on Python 2 integer division for
                # the instance-index computation
                if (delta % t.offset_start[1]) == 0 \
                   and (delta / t.offset_start[1]) < t.recurring:
                    return t
    # for use in post_fix
    def auto_tables_offset(self):
        """Default each table's offset from its 'Offset Start' header field."""
        for t in self.tables:
            t.auto_offset()
+ t.auto_offset()
+
+
def abs_tables_from_pages(pages, profile):
    """
    Simple wrapper: parse every loaded page, merge the resulting tables
    into an AbsTables collection, run per-table content analysis and the
    profile's post-fix pass.
    """
    abs_tables = AbsTables()
    for page in pages:
        for table in PageTE(page, profile).tables:
            abs_tables.append_table(table)
    for table in abs_tables.tables:
        table.analyse_content()
    profile.post_fix(abs_tables)
    return abs_tables