summary refs log tree commit diff
path: root/parse_datasheet/datasheet_tables.py
diff options
context:
space:
mode:
Diffstat (limited to 'parse_datasheet/datasheet_tables.py')
-rw-r--r--  parse_datasheet/datasheet_tables.py  1035
1 files changed, 1035 insertions, 0 deletions
diff --git a/parse_datasheet/datasheet_tables.py b/parse_datasheet/datasheet_tables.py
new file mode 100644
index 0000000..778783a
--- /dev/null
+++ b/parse_datasheet/datasheet_tables.py
@@ -0,0 +1,1035 @@
+#!/usr/bin/python
+
+import xml.etree.cElementTree as ET
+from operator import attrgetter, itemgetter
+import re
+from collections import namedtuple
+from copy import copy
+
+DBG = 0
+
def filter_pages(doc, min_page, max_page):
    """
    Return the <page> elements of the parsed XML document whose 'number'
    attribute lies in [min_page, max_page] (both inclusive).
    """
    # Element.getchildren() is deprecated (and removed in Python 3.9);
    # iterating the element yields the same direct children.
    return [page
            for page in doc.getroot()
            if min_page <= int(page.attrib['number']) <= max_page]
+
+
def load_datasheet_pages(filepath, min_page, max_page):
    """Parse the pdftohtml XML file and return its pages in [min_page, max_page]."""
    tree = ET.parse(filepath)
    return filter_pages(tree, min_page, max_page)
+
+
# Unicode punctuation -> ASCII substitutions applied before the lossy
# 'replace' encoding in asciify() (curly quotes, apostrophes, en-dash).
ASCIIFY_TABLE = {
    u'\u201c': '"',
    u'\u201d': '"',
    u'\u2019': "'",
    u'\u2018': "'",
    u'\u2013': "-",
}
+
def asciify(us):
    """
    Force a unicode string down to ASCII: known punctuation is mapped via
    ASCIIFY_TABLE, anything else non-ASCII becomes '?'.
    """
    return ''.join(ASCIIFY_TABLE.get(ch, ch).encode('ascii', 'replace')
                   for ch in us)
+
+
class State(object):
    """Lightweight named sentinel used as a parser state / return marker."""
    __slots__ = ['name']
    def __init__(self, name):
        self.name = name
    def __repr__(self):
        return '<%s>' % (self.name,)
+
+
+def _get_all_text_rec(elem, parts):
+ if elem.text:
+ parts.append(elem.text)
+ for c in elem.getchildren():
+ _get_all_text_rec(c, parts)
+ if c.tail:
+ parts.append(c.tail)
+ return parts
+
def get_all_text(elem):
    """Concatenate all text below elem (text and tails) into one unicode string."""
    parts = _get_all_text_rec(elem, [])
    return u''.join(parts)
+
+
# Hashable identity of a text element; used by profiles to pinpoint one
# exact TE on a page (see ProfileBase.special_te_mapper).
TE_Id = namedtuple('TE_Id', 'text top left width height')
+
class SomeText(object):
    """One <text> element of the pdftohtml XML output, plus parser bookkeeping."""
    __slots__ = ['text', 'top', 'left', 'width', 'height', 'handled', 'special']

    def __init__(self, text, top, left, width, height, special):
        self.text = text
        self.top = top
        self.left = left
        self.width = width
        self.height = height
        self.special = special
        # flipped to True once some dispatch rule consumed this element
        self.handled = False

    @classmethod
    def from_et(cls, elem):
        """Build from an ElementTree <text> node; geometry attributes as ints."""
        attrs = elem.attrib
        return cls(get_all_text(elem),
                   int(attrs['top']), int(attrs['left']),
                   int(attrs['width']), int(attrs['height']),
                   special=False)

    @classmethod
    def special_from_id(cls, te_id):
        """Build a profile-provided replacement element, marked special."""
        return cls(te_id.text, te_id.top, te_id.left,
                   te_id.width, te_id.height, special=True)

    def identity(self):
        """Hashable identity, stable across parses of the same page."""
        return TE_Id(self.text, self.top, self.left, self.width, self.height)

    def __repr__(self):
        tag = " special" if self.special else ""
        return '<SomeText top=%r left=%r width=%r height=%r text=%r%s>' % (
            self.top, self.left, self.width, self.height, self.text, tag)
+
+
def dbg_fmt_hor_pos(hp):
    """Debug helper: (name, [left, right]) pairs sorted left to right."""
    return sorted(hp.items(), key=lambda kv: kv[1])
+
# Column indices of a register table, in left to right order:
# WARNING: this should not ever need to change, but if it does after all
# you should carefully review this whole file...
COL_RANGE = 0
COL_ACRONYM = 1
COL_DESCRIPTION = 2
COL_STICKY = 3
COL_RESET = 4
COL_ACCESS = 5
COL_NUMBER = 6

# Column Descriptor: nominal width (in page units) and left-to-right index.
CD = namedtuple('CD', 'width idx')
+
class ColumnsPos(object):
    """
    Tracks the horizontal extent of the six register-table columns.
    Column-header text elements refine self.hor_pos (a name -> [left, right]
    map); table content is later matched against these extents via search().
    """

    # Nominal column widths (page units) and their left-to-right indices.
    Columns = {
        'bit range': CD(60, COL_RANGE),
        'bit acronym': CD(58, COL_ACRONYM),
        'bit description': CD(204, COL_DESCRIPTION),
        'sticky': CD(28, COL_STICKY),
        'bit reset': CD(58, COL_RESET),
        'bit access': CD(51, COL_ACCESS),
    }

    # horizontal tolerance (page units) when matching neighbouring columns
    ADJUST = 10

    def __init__(self):
        # column name -> [left, right], or None until its header is seen
        self.hor_pos = dict([(k, None) for k in self.Columns.iterkeys()])
        # vertical position of the header row (set on 'bit range')
        self.ver_pos = None
        # DBG:
        self.raw_hor_pos = dict(self.hor_pos)

    def overlap(self, txt, left, right):
        # Name of an already-recorded column whose extent intersects
        # [left, right), or None.
        for k, a in self.hor_pos.iteritems():
            if a and (left < a[1]) and not (right <= a[0]):
                return k

    def find_k_left(self, left):
        # Column whose right edge lies within ADJUST of this left edge.
        for k, a in self.hor_pos.iteritems():
            if a and (left - self.ADJUST <= a[1] <= left + self.ADJUST):
                return k

    def find_k_right(self, right):
        # Column whose left edge lies within ADJUST of this right edge.
        for k, a in self.hor_pos.iteritems():
            if a and (right - self.ADJUST <= a[0] <= right + self.ADJUST):
                return k

    def adjust(self, pos_a, cd_a, pos_b, cd_b):
        # Weighted average of two near-coincident column edges.
        w = cd_a.width + cd_b.width
        # the longest can be adjusted more:
        return ((pos_a * cd_b.width) # move less if the other is longer
                + (pos_b * cd_a.width)
                + w/2) / w

    def try_column_header(self, te, dbg=""):
        """
        If te is a column header ('bit range', ..., with 'value' accepted
        as an alias for 'bit reset'), record or refine its horizontal
        extent and return True; return False for non-header text.
        Raises ValueError when the new extent overlaps another column.
        """
        txt = te.text.strip().lower()
        if txt == 'bit range':
            self.ver_pos = te.top
        cd = self.Columns.get(txt)
        if (not cd) and txt == 'value':
            cd = self.Columns['bit reset']
        if not cd:
            return False

        # center the nominal width on the header text element
        left = te.left + te.width / 2 - cd.width / 2
        right = left + cd.width

        if txt == 'value':
            # 'value' must sit where 'bit reset' was already recorded
            o_l, o_r = self.hor_pos['bit reset']
            if not (o_l - self.ADJUST <= left <= o_l + self.ADJUST) \
               or not (o_r - self.ADJUST <= right <= o_r + self.ADJUST):
                print "WARNING: %r:" % dbg, \
                    "'value' (%d, %d) not haligned with 'bit reset'" \
                    % (left, right), \
                    "### %r" % vars(self)
            return True

        # DBG:
        self.raw_hor_pos[txt] = [left, right]

        # glue this column's edges to already-seen neighbours
        k_left = self.find_k_left(left)
        if k_left:
            near_left = self.hor_pos[k_left][1]
            cd_left = self.Columns[k_left]
            if cd_left.idx + 1 != cd.idx:
                print "WARNING: %r:" % dbg, \
                    "column left of '%s' should not be '%s'" % (txt, k_left)
            self.hor_pos[k_left][1] = left = self.adjust(left, cd,
                                                         near_left, cd_left)
        k_right = self.find_k_right(right)
        if k_right:
            near_right = self.hor_pos[k_right][0]
            cd_right = self.Columns[k_right]
            if cd_right.idx - 1 != cd.idx:
                print "WARNING: %r:" % dbg, \
                    "column right of '%s' should not be '%s'" % (txt, k_right)
            self.hor_pos[k_right][0] = right = self.adjust(right, cd,
                                                           near_right, cd_right)

        other_k = self.overlap(txt, left, right)
        if other_k:
            raise ValueError("column %r (%d, %d) overlaps with %r (%d, %d)"
                             % (txt, left, right, other_k,
                                self.hor_pos[other_k][0],
                                self.hor_pos[other_k][1]))

        self.hor_pos[txt] = [left, right]

        return True

    @staticmethod
    def matching_level(l, r, l_ref, r_ref):
        # Fraction of [l, r) covered by [l_ref, r_ref); 0.0 when disjoint.
        l_com = max(l, l_ref)
        r_com = min(r, r_ref)
        if l_com >= r_com:
            return 0.0
        else:
            return (r_com - l_com) / float(r - l)

    def search(self, left, right, min_match_level, out_level=None):
        """
        Column index whose extent overlaps [left, right) by at least
        min_match_level (fraction of the TE width), or None.  When
        out_level is a list, the matching level is stored in out_level[0].
        """
        for k, (l_col, r_col) in self.hor_pos.iteritems():
            level = self.matching_level(left, right, l_col, r_col)
            if level >= min_match_level:
                if out_level is not None:
                    out_level[0] = level
                return self.Columns[k].idx
+
+
class Cell(object):
    """
    One table cell: text accumulated as a list of paragraphs.  Text
    elements on the same visual line extend the current paragraph,
    lower ones open a new paragraph.
    """
    ADJ_X = 5

    def __init__(self, column, col_name):
        self.paragraphs = []
        self.last_te = None    # previous inserted TE, for the line-merge test
        self.column = column
        self.col_name = col_name

    def is_empty(self):
        return not self.paragraphs

    def insert_te(self, te):
        """Append te's (asciified) text, merging it when it continues the previous line."""
        text = asciify(te.text)
        prev = self.last_te
        if prev is not None and te.top <= prev.top + prev.height:
            self.paragraphs[-1] += text
        else:
            self.paragraphs.append(text)
        self.last_te = te

    def analyse(self):
        # base cells never warn; subclasses refine this
        self.analysis_warning = False

    # post-analyse
    def simple_lines(self):
        """Printable lines: the column name, then each paragraph indented."""
        lines = [self.col_name]
        for paragraph in self.paragraphs:
            lines.append("\t" + paragraph)
        return lines
+
+
class CellSymbol(Cell):
    """Cell expected to hold one single short token (acronym, flag, ...)."""
    WARN_EMPTY = True   # subclasses may tolerate an empty cell

    def analyse(self):
        if self.is_empty():
            self.content = ''
            self.analysis_warning = self.WARN_EMPTY
        else:
            self.content = self.paragraphs[0].strip()
            # several paragraphs means this was not a single token
            self.analysis_warning = len(self.paragraphs) > 1

    def warn(self):
        if self.analysis_warning:
            return "\t\t<warning>"
        return ""

    def simple_lines(self):
        return ["%-13s %s%s" % (self.col_name, self.content, self.warn())]
+
+
class CellRange(CellSymbol):
    """'Bit Range' cell: parses "<hi>-<lo>" or a single bit number into .range."""

    def _parse(self):
        # "<hi>-<lo>" (the separator may be '-', ':' or spaces)
        m = re.match(r"(\d+)[- :]+(\d+)", self.content)
        if m:
            self.range = (int(m.group(1)), int(m.group(2)))
            return
        # single-bit field: "<n>" -> (n, n)
        try:
            bit = int(self.content)
        except ValueError:
            return
        self.range = (bit, bit)

    def analyse(self):
        CellSymbol.analyse(self)
        self.range = None
        self._parse()
        if self.range is None:
            self.analysis_warning = True

    def bits_width(self):
        """Number of bits covered by the range, or None when unparsed."""
        if self.range is None:
            return None
        hi, lo = self.range
        return hi + 1 - lo

    def str_range(self):
        if self.range is None:
            return 'None'
        hi, lo = self.range
        if hi == lo:
            return "%d" % hi
        return "%d-%d" % (hi, lo)

    def simple_lines(self):
        return ["%-13s %s%s" % (self.col_name, self.str_range(), self.warn())]
+
+
class CellConstraint(CellSymbol):
    """
    Abstract single-token cell whose content must belong to a fixed set;
    subclasses define ALLOWED, e.g.:
        ALLOWED = ('RO', 'RW')
    """
    def analyse(self):
        CellSymbol.analyse(self)
        self.analysis_warning = (self.analysis_warning
                                 or self.content not in self.ALLOWED)
+
+
class CellSticky(CellConstraint):
    # 'Sticky' column: an empty cell is acceptable here, so do not let
    # CellSymbol.analyse flag emptiness.
    WARN_EMPTY = False # for upper layers
    ALLOWED = ('Y', 'N', '')
+
+
class CellAccess(CellConstraint):
    # 'Bit Access' column: access tokens observed so far; the trailing
    # comments record which register set required each addition.
    ALLOWED = ('RW', 'RO', 'RWS', # <= enough for smrbase
               'RWC', 'RWO', 'RW0C', # + imch_conf
               'RWL', # + imch_conf
               'RV', 'RCWC', 'WO', 'RC', # + gbe
               'RO/RWC') # + gbe
+
+
def to_bin(i):
    """
    Binary string of a non-negative int, without prefix or padding
    ('0' for 0, '101' for 5).
    """
    if i == 0:
        return '0'
    digits = []
    while i:
        digits.append(str(i & 1))
        i //= 2   # explicit floor division: '/' is Python-2-only here
    digits.reverse()
    return ''.join(digits)
+
class CellReset(CellSymbol):
    """
    'Bit Reset' cell: a reset value written "...b" (binary), "...h" (hex)
    or a bare single bit '0'/'1'.  After analyse(), .value/.base hold the
    parsed number (None on failure); the upper layer supplies the field
    width through set_bits() so representations can be zero-padded.
    """
    def analyse(self):
        CellSymbol.analyse(self)
        self.value = None
        self.base = None
        self.bits = None   # field width, set later by set_bits()
        if self.content.endswith('b'):
            self.base = 2
        elif self.content.endswith('h'):
            self.base = 16
        if self.base:
            try:
                self.value = int(self.content[:-1], self.base)
            except ValueError:
                pass
        if self.content in ('0', '1'):
            self.value = int(self.content)
            self.base = 2
        if self.value is None:
            self.analysis_warning = True
    # post-analyse
    def set_bits(self, bits):
        """Record the field width (taken from the range cell)."""
        self.bits = bits
    def bin_repr(self):
        """Binary rendering, zero-padded to the field width when known."""
        if self.value is None:
            return 'None'
        bin = to_bin(self.value)
        if self.bits is None:
            return bin + 'b'
        else:
            return ('0' * (self.bits - len(bin))) + bin + 'b'
    def hex_repr(self):
        """Hex rendering, zero-padded to the nibble count when known."""
        if self.value is None:
            return 'None'
        if self.bits is None:
            return "%xh" % self.value
        else:
            # explicit floor division: '/' on ints is Python-2-only behaviour
            nibs = (self.bits + 3) // 4
            return ("%%0%dxh" % nibs) % self.value
    def simple_repr(self):
        """Both renderings; brackets mark the datasheet's own base."""
        if self.value is None:
            return 'None'
        bh = (self.bin_repr(), self.hex_repr())
        if self.base == 2:
            return "[ %s ]\t %s " % bh
        else:
            return " %s \t[ %s ]" % bh
    def simple_lines(self):
        return ["%-11s %s%s" % (self.col_name, self.simple_repr(), self.warn())]
+
+
# Dispatch return codes; the producer of each code is noted per line
# (Line = Line.try_insert_te, PageTE = the PageTE TE dispatch).
                                     # Line PageTE
INSERTED = State('INSERTED') # X X
NEW_LINE = State('NEW_LINE') # X
INVALID_LINE = State('INVALID_LINE') # X
NOT_HANDLED = State('NOT_HANDLED') # X
END_OF_PAGE = State('END_OF_PAGE') # X

# Matches iff the text STARTS with a character valid in a bit range
# (digit, '-', ':' or space); used to reject bogus range cells.
VALID_TE_RANGE = re.compile(r'[-0-9 :]').match
+
class Line(object):
    """
    One register-description row: six typed cells plus an estimated
    vertical span [top, bottom] used to decide whether a text element
    belongs to this row or opens the next one.
    """
    def __init__(self, top):
        # direct symbolic cell access
        # prefixed by c_ cause i might also use unprefixed attrs in the
        # future and directly put payloads there.
        self.c_range = CellRange (COL_RANGE, 'range')
        self.c_acronym = CellSymbol (COL_ACRONYM, 'acronym')
        self.c_description = Cell (COL_DESCRIPTION, 'description')
        self.c_sticky = CellSticky (COL_STICKY, 'sticky')
        self.c_reset = CellReset (COL_RESET, 'reset')
        self.c_access = CellAccess (COL_ACCESS, 'access')
        # access by columns index:
        self.cells = [
            self.c_range,
            self.c_acronym,
            self.c_description,
            self.c_sticky,
            self.c_reset,
            self.c_access,
        ]
        self.top = top
        self.bottom = None  # estimated once the first range TE is inserted
        self.bits = None    # field width, filled in by analyse_line()
    def is_empty(self):
        for c in self.cells:
            if not c.is_empty():
                return False
        return True
    def try_insert_te(self, column, te):
        """
        Try to add te to the cell at column index `column`.  Returns
        INSERTED on success, NEW_LINE when te lies below this row (the
        caller must open a new Line), or INVALID_LINE when a range cell
        receives non-range text (the whole row must be discarded).
        """
        if (self.bottom is None) or (te.top < self.bottom):
            # NOTE: this only works if the range cell contains only one line
            if (self.bottom is None) and (column == COL_RANGE):
                if not VALID_TE_RANGE(te.text):
                    return INVALID_LINE
                # first estimate: the row is vertically centered on its
                # range TE, so it extends below by (te.top - self.top)
                self.bottom = 2 * te.top - self.top
            if (self.bottom is not None) \
               and (te.top + te.height - 1 > self.bottom):
                # grow the row so it contains every inserted TE
                self.bottom = te.top + te.height - 1
            self.cells[column].insert_te(te)
            return INSERTED
        else:
            return NEW_LINE
    def simple_print(self):
        for c in self.cells:
            for l in c.simple_lines():
                print l
    def value_print(self, value):
        """Print the row plus a synthetic 'value' cell sliced out of value."""
        cell_value = copy(self.c_reset)
        bit_base = self.c_range.range[1]
        bit_mask = (1 << cell_value.bits) - 1
        cell_value.value = (value >> bit_base) & bit_mask
        cell_value.col_name = "value"
        for c in self.cells[:2] + [cell_value] + self.cells[2:]:
            for l in c.simple_lines():
                print l
    def analyse_line(self):
        """Analyse every cell, then propagate the bit width to the reset cell."""
        for c in self.cells:
            c.analyse()
        self.bits = self.c_range.bits_width()
        self.c_reset.set_bits(self.bits)
+
+
def strip_strict_ascii(s):
    """Strip surrounding whitespace and ASCII-encode, rejecting non-ASCII text."""
    stripped = s.strip()
    return stripped.encode('ascii', 'strict')
+
def parse_offset(s):
    """
    Parse a trailing "XXh" or "XXh at YYh" hex offset specification.
    Returns (base, increment) with increment None when absent; raises
    ValueError when nothing matches.
    """
    m = re.search(r"([0-9a-f]+)h(?:\s+at\s+([0-9a-f]+)h)? *$", s, re.I)
    if not m:
        raise ValueError("can't parse offset %r" % s)
    base_s, inc_s = m.groups()
    inc = int(inc_s, 16) if inc_s else inc_s
    return int(base_s, 16), inc
+
def format_offset(off):
    """Human-readable offset: '-' when unknown, raw text as-is, ints as hex."""
    if off is None:
        return "-"
    if isinstance(off, basestring):
        return off
    return "%02Xh" % off
+
def parse_size(s):
    """Parse a register size such as "32 bit(s)" into its bit count, or raise ValueError."""
    m = re.match(r"(\d+)\s+bit", s, re.I)
    if not m:
        raise ValueError("can't parse size %r" % s)
    return int(m.group(1))
+
def parse_default(s):
    """
    Parse a 'Default:' field: "XXh" becomes an int; anything not
    hex-looking (including "don't care" values containing 'X') is
    returned verbatim.
    """
    s = s.strip()
    if not s.endswith('h'):
        # raise ValueError("can't parse default value %r" % s)
        return s
    try:
        return int(s[:-1], 16)
    except ValueError:
        if "X" in s: # gruik hack
            return s
        raise
+
def fmt_identity(t, x):
    # Table.HeaderAttr formatter hook: return the attribute value unchanged
    # (t is the Table instance, unused here).
    return x
+
def fmt_hex(t, x):
    """Table.HeaderAttr formatter hook: lowercase hex with an 'h' suffix."""
    return format(x, 'x') + 'h'
+
def fmt_offset_start_end(t, x):
    """Format an (offset, increment) pair; the increment is omitted when None."""
    base, inc = x
    if inc is None:
        return "%xh" % base
    return "%xh at %xh" % (base, inc)
+
def fmt_default(t, x):
    """
    Format a default value: unparsed (string) defaults pass through,
    ints are zero-padded to the table's size in nibbles.
    """
    if isinstance(x, basestring):
        return x
    # explicit floor division: '/' on ints is Python-2-only behaviour
    nibs = (t.size + 3) // 4
    return ("%%0%dxh" % nibs) % x
+
+
# Field Descriptor for header key/value pairs (see Table.Fields):
#   attr_name - Table attribute receiving the parsed value
#   adj_y/adj_x - spatial search tolerance for the value TE right of the label
#   parser - callable turning the raw text into the stored value
FD = namedtuple('FD', 'attr_name adj_y adj_x parser')
+
class Table(object):
    """
    One register-description table, built incrementally during page parse:
    title metadata, the key/value header fields, the column geometry
    (ColumnsPos) and finally the content rows (Line instances).
    """

    # WARNING:
    # offset_end often contains random information in recurring registers

    # Header fields: label text -> FD(attr_name, adj_y, adj_x, parser)
    Fields = { 'Description:': FD('description', 2, 5, strip_strict_ascii),
               'View:': FD('view', 2, 5, strip_strict_ascii),
               'BAR:': FD('bar', 2, 5, strip_strict_ascii),
               'Offset Start:': FD('offset_start', 2, 5, parse_offset),
               'Offset End:': FD('offset_end', 2, 5, parse_offset),
               'Power Well:': FD('power_well', 2, 5, strip_strict_ascii),
               'Size:': FD('size', 2, 5, parse_size),
               'Default:': FD('default', 2, 5, parse_default),
               'Bus:Device:Function:': FD('bus_device_function', 2, 5, strip_strict_ascii) }

    # (attribute name, formatter) pairs in display order; used by
    # value_print(), fmt() and header_mismatch().
    HeaderAttr = [
        # from table title:
        ('table_ref', fmt_identity),
        ('offset', fmt_hex),
        ('reg_name', fmt_identity),
        ('recurring', fmt_identity),
        ('reg_base_name', fmt_identity),
        ('title_desc', fmt_identity),

        # from table header:
        ('description', fmt_identity),
        ('view', fmt_identity),
        ('bar', fmt_identity),
        ('offset_start', fmt_offset_start_end),
        ('offset_end', fmt_offset_start_end),
        ('power_well', fmt_identity),
        ('size', fmt_identity),
        ('default', fmt_default),
        ('bus_device_function', fmt_identity),
    ]
    HeaderAttrDict = dict(HeaderAttr)

    START_LINE_MAX_HEIGHT = 11 # max interval _between_ title lines
    # approximate height of the column-header row, used to place the first
    # content row below it
    COL_HEADER_HEIGHT = 26
    def __init__(self, chapter, table_num, offset, reg_name, title_desc):
        self.chapter = chapter
        self.table_num = table_num
        self.offset = offset
        self.reg_name = reg_name
        self.title_desc = title_desc
        self.table_ref = "%d-%d" % (self.chapter, self.table_num)
        #
        self.recurring, self.reg_base_name = self.parse_reg_name(reg_name)
        #
        # header fields, filled in by set_field_parse() as labels are found
        self.description = None
        self.view = None
        self.bar = None
        self.offset_start = None
        self.offset_end = None
        self.power_well = None
        self.size = None
        self.default = None
        self.bus_device_function = None
        #
        self.columns = None
        self.cur_line = None
        self.lines = []
    @staticmethod
    def parse_reg_name(reg_name):
        """
        Detect a recurring register name "NAME[<a>-<b>]".  Returns
        (instance_count, base_name), or (None, reg_name) for a plain
        (non-recurring) register.
        """
        m = re.match(r"(.*?)[[](\d+)[-:](\d+)[]]", reg_name)
        if not m:
            return None, reg_name
        reg_base_name, left, right = m.groups()
        left = int(left)
        right = int(right)
        low = min(left, right)
        high = max(left, right)
        if low != 0:
            print ("WARNING: don't know how to handle recurring register %s "
                   "not starting at index 0. Loading as non recurring for now."
                   % reg_name)
            return None, reg_base_name
        return high + 1, reg_base_name
    def attrs_set_abstract(self):
        # drop parse-time-only state, leaving the "abstract" table
        del self.columns
        del self.cur_line
    def clone_abstract(self):
        """
        Clone the abstract part of this table, that is everything needed for
        extraction of payload informations.
        result.columns and result.cur_line are deleted.
        self.lines is shallow copied. => result.lines = self.lines[:]
        """
        result = copy(self)
        result.attrs_set_abstract()
        result.lines = self.lines[:]
        return result
    def extend_abstract(self, other):
        """
        Once a first 'concrete' table has been cloned to an abstract version,
        it can be extended with other parts of the same table (from subsequent
        pages). Example if a table is splitted between (end of) page p_1 and
        (start of page) p_2:
        >>> pte_1 = PageTE(p_1, profile)
        >>> pte_2 = PageTE(p_2, profile)
        >>> full_table = pte_1.tables[-1].clone_abstract()
        >>> full_table.extend_abstract(pte_2.tables[0])
        """
        self.lines.extend(other.lines)
    def header_mismatch(self, other):
        """List of (attr, self_value, other_value) for differing header attrs."""
        return [
            (attr, getattr(self, attr), getattr(other, attr))
            for (attr, _) in self.HeaderAttr
            if getattr(self, attr) != getattr(other, attr)
        ]
    def use_columns_pos(self):
        # attach fresh column-geometry tracking for the parse stage
        self.columns = ColumnsPos()
    def cat_start_line(self, text):
        # titles may wrap over several lines: concatenate the continuation
        self.title_desc += text.encode('ascii', 'strict')
    def set_field_parse(self, fd, s):
        """Parse s with fd.parser and store it on the attribute fd.attr_name."""
        try:
            setattr(self, fd.attr_name, fd.parser(s))
        except:
            print "exception while parsing %r" % vars(self)
            raise
    def close(self):
        # strip the "(Sheet n of m)" continuation markers from the title
        self.title_desc = re.sub(r'\s*\(Sheet .* of .*\)\s*', ' ',
                                 self.title_desc).strip()
        self.cur_line = None
    def start_content(self):
        # open the first row just below the column-header row
        self.cur_line = Line(self.columns.ver_pos + self.COL_HEADER_HEIGHT / 2)
        self.lines.append(self.cur_line)
    def insert_content_te(self, column, te):
        """
        Route te into the current row, opening a new row when te lies
        below it.  Returns False when te invalidates the row (bogus range
        text): the row is dropped and the caller should close the table.
        """
        res = self.cur_line.try_insert_te(column, te)
        if res == INSERTED:
            return True
        elif res == NEW_LINE:
            if self.cur_line.bottom is None:
                # previous row never got a range TE: anchor the new row
                # just above te
                next_top = te.top + 1 - te.height
            else:
                next_top = self.cur_line.bottom
            self.cur_line = Line(next_top)
            self.lines.append(self.cur_line)
            self.cur_line.try_insert_te(column, te)
            return True
        else: # INVALID_LINE
            self.lines = self.lines[:-1]
            if self.lines:
                self.cur_line = self.lines[-1]
            else:
                self.cur_line = None
            return False
    def analyse_content(self):
        for l in self.lines:
            l.analyse_line()
    def check_ranges(self):
        """True iff the rows' bit ranges tile [size-1 .. 0] contiguously."""
        rembits = self.size
        for l in self.lines:
            if (l.c_range.range is None) or (rembits != l.c_range.range[0] + 1):
                return False
            rembits = l.c_range.range[1]
        return rembits == 0
    def auto_offset(self):
        # default the title offset from the 'Offset Start' header field
        if self.offset is None:
            self.offset = self.offset_start[0]
    def fmt(self, field_name):
        """Format one header attribute using its HeaderAttr formatter."""
        fmt_func = self.HeaderAttrDict[field_name]
        return fmt_func(self, getattr(self, field_name))
    def value_print(self, value):
        """Pretty-print the whole table with value decomposed per bit field."""
        print '============================================================'
        print '%xh' % self.offset, self.reg_name
        for h, fmt in self.HeaderAttr:
            print " %-15s %s" % (h, fmt(self, getattr(self, h)))
        print
        for l in self.lines:
            print " ----------------"
            l.value_print(value)
        print
        print
+
+
class ProfileBase(object):
    """
    Base class for register-set parsing profiles.

    Parsing register tables out of the informally written 320066 datasheet
    usually requires small adjustments while text elements are dispatched
    and/or after the parse stage on the abstracted tables; some tables also
    need to be blacklisted and the page range must be chosen.

    Derive this class to implement such a register set profile,
    then register it by calling register_profile(derived_class).
    """

    name = None # string - symbolic name of the register set
    min_page = None # int - first page to parse (included)
    max_page = None # int - last page to parse (included)

    @staticmethod
    def special_te_mapper(ps, te, table):
        """
        This method can be used to help implement the .special_replace() method.
        ps: Parse State - instance of PageTE.ParseState
        te: Text Element - instance of SomeText
        table: replacement (hash) table, ex:
            {
            (432, TE_Id(text=u'N 00001b', top=384, left=409, width=60, height=8)):
                [TE_Id(text=u'N', top=384, left=409, width=4, height=8),
                 TE_Id(text=u'00001b', top=384, left=427, width=50, height=8)],
            }
        The key is (page_num, te_id) where te_id describes the original text
        element which could not be fitted in a table cell.
        The value is a list of TE_Id, to replace the offending one.
        Returns the replacement SomeText list, or None when te is unmapped.
        """
        replacement = table.get((ps.pte.page_num, te.identity()))
        if replacement is None:
            return None
        return [SomeText.special_from_id(te_id) for te_id in replacement]

    def special_replace(self, ps, te):
        """Hook: return replacement TEs for te, or None to leave it alone."""
        return None

    def post_fix(self, a_t):
        """Hook: adjust the AbsTables instance after the parse stage."""
        pass

    def table_blacklisted(self, chapter, table_num):
        """
        Hook: return True to skip Table <chapter>-<table_num> entirely.
        chapter: int
        table_num: int
        """
        return False
+
+
# symbolic profile name -> profile class
_profiles = {}

def register_profile(profile_class):
    """Register a ProfileBase subclass under its .name for profile_factory()."""
    if not isinstance(profile_class.name, basestring):
        # message fixed: it is the class' .name attribute that is being
        # validated here, not the class itself
        raise ValueError("profile_class.name should be a string")
    _profiles[profile_class.name] = profile_class
+
+
def profile_factory(profile_name):
    """Instantiate the registered profile named profile_name, or raise ValueError."""
    try:
        profile_class = _profiles[profile_name]
    except KeyError:
        raise ValueError("profile %s does not exist" % profile_name)
    return profile_class()
+
+
# Unhandled Description: a text element no dispatch rule consumed, with
# the table it was encountered in, for the end-of-page report.
UD = namedtuple('UD', 'te table_ref reg_name')

# Parsed pieces of a "Table <c>-<n>. [Offset XXh:] NAME - desc" title line.
ParsedTitle = namedtuple('ParsedTitle',
                         'chapter table_num offset reg_name title_desc')
+
class PageTE(object):
    """
    Parses one XML <page> into a list of Table objects (self.tables),
    dispatching each text element through a small state machine:
    NOP -> TITLE -> HEADER -> COL_HEADER -> CONTENT.
    """

    class ParseState(object):
        """
        This contains attributes that are shared between PageTE methods
        during the parse stage, but really conceptually are local vars.
        I put them all here because it's easier to get ride of them all
        like that, and also less error prone. Plus they are clearly
        identified, so user code won't try to do stupid things and hopefully
        the mess in PageTE will be kept limited.
        """
        NOP = State('NOP')
        TITLE = State('TITLE')
        HEADER = State('HEADER')
        COL_HEADER = State('COL_HEADER')
        CONTENT = State('CONTENT')
        def __init__(self, pte, et_page, profile):
            # tels: text elements
            # curt: current table
            # st: state (one of the above)
            #
            # unconsumed TEs recorded per state for the end-of-page report
            self.unhandled = {# no NOP
                              self.TITLE: [],
                              self.HEADER: [],
                              self.COL_HEADER: [],
                              self.CONTENT: []}
            self.pte = pte
            self.tels = [SomeText.from_et(elem)
                         for elem in et_page.findall('text')]
            self.tels.sort(key=lambda elem: (elem.top, elem.left))
            # spatial indexes: TEs bucketed by left (X) and top (Y)
            self.tels_X = [[] for i in xrange(self.pte.page_width)]
            self.tels_Y = [[] for i in xrange(self.pte.page_height)]
            for te in self.tels:
                self.tels_X[te.left].append(te)
                self.tels_Y[te.top].append(te)
            self.curt = None
            self.st = self.NOP
            self.title_y = None
            self.profile = profile
        def close_curt(self):
            # finalize the current table (if any) and fall back to NOP
            if self.curt:
                self.curt.close()
                self.curt = None
            self.st = self.NOP
        def find_te_around(self, y, x, ay, ax):
            # First TE within +/-ay of y and +/-ax of x (scanning rows top
            # to bottom), or None; used to locate header field values.
            y_min = max(0, y - ay)
            y_max = min(self.pte.page_height, y + ay + 1)
            x_min = max(0, x - ax)
            x_max = min(self.pte.page_width, x + ax + 1)
            for tel_line in self.tels_Y[y_min:y_max]:
                for te in tel_line:
                    if x_min <= te.left < x_max:
                        return te

    # warn about header field labels with no value TE next to them
    WARN_EMPTY = False

    # minimum overlap fraction for a TE to be assigned to a column
    MIN_MATCH_COL = 0.9

    # any text under this point marks and end of page
    PAGE_BOTTOM = 732 # gruik hack
    TABLE_CHAR_MAX_SIZE = 10 # quite gruik too

    # matches a table title; groups: chapter, table_num, optional offset,
    # register name (possibly with a recurring "[a-b]" suffix), description
    start_search = re.compile(ur"""
        Table\s+(\d+)-(\d+)\s*\.\s* # <chapter>-<table_num>
        (?:Offset\s+([0-9a-f]+)h\s*:\s*)? # <offset>
        (\S+?(?:[[]\S+?[]])?)(?:\s?[-\u2013:])+ # <reg_name>
        (.*)$ # <title_desc>
        """, re.I | re.VERBOSE).search

    def parse_table_title(self, ps, text):
        """
        Match a table title line.  Returns a ParsedTitle, or None when
        text is not a title or the table is blacklisted by the profile.
        """
        m = self.start_search(text)
        if not m:
            return None
        chapter = int(m.group(1))
        table_num = int(m.group(2))
        offset_group = m.group(3)
        offset = (offset_group and int(offset_group, 16))
        if ps.profile.table_blacklisted(chapter, table_num):
            return None
        return ParsedTitle(
            chapter=chapter,
            table_num=table_num,
            offset=offset,
            reg_name=m.group(4).encode('ascii', 'strict'),
            title_desc=m.group(5).encode('ascii', 'strict'))

    @staticmethod
    def PteTable(parsed_title):
        # Build a Table from a parsed title, with fresh column tracking.
        pt = Table(parsed_title.chapter,
                   parsed_title.table_num,
                   parsed_title.offset,
                   parsed_title.reg_name,
                   parsed_title.title_desc)
        pt.use_columns_pos()
        return pt

    # note: TEs are usually sorted by (y, x), but not strictly
    # in case of a special case. Don't do insane things.
    def _process_te(self, ps, te):
        # Dispatch one text element according to the current state.
        # Returns INSERTED, NOT_HANDLED or END_OF_PAGE.
        if te.top >= self.PAGE_BOTTOM:
            return END_OF_PAGE
        # a table title may show up in any state and opens a new table
        parsed_title = self.parse_table_title(ps, te.text)
        if parsed_title:
            ps.close_curt()
            ps.curt = self.PteTable(parsed_title)
            self.tables.append(ps.curt)
            ps.title_y = te.top
            ps.st = ps.TITLE
            return INSERTED
        # header key/value field: the value TE is found spatially, right
        # of the label, and consumed immediately
        if (ps.st in (ps.TITLE, ps.HEADER)) and (te.text in Table.Fields):
            ps.st = ps.HEADER
            fd = Table.Fields[te.text]
            ote = ps.find_te_around(te.top,
                                    te.left + te.width,
                                    fd.adj_y, fd.adj_x)
            if ote:
                ote.handled = True
                ps.curt.set_field_parse(fd, ote.text)
            else:
                if self.WARN_EMPTY:
                    print ("WARNING: page %d: empty field %r in "
                           "Table %s Offset %s: %s - %s") % \
                        (self.page_num, te.text,
                         ps.curt.table_ref, format_offset(ps.curt.offset),
                         ps.curt.reg_name, ps.curt.title_desc)
            return INSERTED
        if ps.st == ps.TITLE:
            # titles may wrap over close-by lines: concatenate them
            if te.top < ps.title_y + ps.curt.START_LINE_MAX_HEIGHT:
                ps.curt.cat_start_line(te.text)
                ps.title_y = te.top
                return INSERTED
            # else nothing
        if ps.st in (ps.HEADER, ps.COL_HEADER):
            r = ps.curt.columns.try_column_header(te,
                                                  # info for error traces:
                                                  {'page': self.page_num,
                                                   'table': ps.curt.table_ref})
            if r:
                ps.st = ps.COL_HEADER
                return INSERTED
            elif ps.st == ps.COL_HEADER:
                # first non-header TE after the column headers: content
                # rows start here; fall through to the CONTENT handling
                ps.curt.start_content()
                ps.st = ps.CONTENT
                if DBG:
                    print "RAW", ps.curt.table_ref, ps.curt.reg_name, ps.curt.columns.ver_pos, dbg_fmt_hor_pos(ps.curt.columns.raw_hor_pos)
                    print "POS", ps.curt.table_ref, ps.curt.reg_name, ps.curt.columns.ver_pos, dbg_fmt_hor_pos(ps.curt.columns.hor_pos)
        if ps.st == ps.CONTENT:
            if te.height > self.TABLE_CHAR_MAX_SIZE:
                # oversized text cannot belong to the table: close it
                ps.close_curt()
                return INSERTED
            lvl = [None]
            column = ps.curt.columns.search(te.left, te.left + te.width,
                                            self.MIN_MATCH_COL, lvl)
            if DBG:
                print "III", ps.curt.table_ref, ps.curt.reg_name, lvl[0], te
            if column is not None:
                # XXX log dismissed one that do not close the table?
                # need to add a special return value to insert_content_te
                # to do that
                if not ps.curt.insert_content_te(column, te):
                    ps.close_curt()
                return INSERTED
            elif not te.special:
                # no matching column: let the profile split/replace this
                # TE and retry with the replacements (marked special, so
                # recursion terminates)
                spe_tels = self.try_special(ps, te)
                if spe_tels is not None:
                    if not spe_tels:
                        return INSERTED
                    result = NOT_HANDLED
                    for spe_te in spe_tels:
                        spe_r = self.process_te(ps, spe_te)
                        if spe_r == END_OF_PAGE:
                            return END_OF_PAGE
                        elif spe_r == INSERTED:
                            result = INSERTED
                    if result == INSERTED:
                        return INSERTED
        if ps.st in ps.unhandled:
            ps.unhandled[ps.st].append(UD(te, ps.curt.table_ref,
                                          ps.curt.reg_name))
        return NOT_HANDLED

    def process_te(self, ps, te):
        """
        factors out handling of te.handled from _process_te
        """
        if te.handled:
            return INSERTED
        r = self._process_te(ps, te)
        if r != NOT_HANDLED:
            te.handled = True
        return r

    def try_special(self, ps, te):
        """
        Returns None if this te is not to be handled as a special case.
        Else returns a list of te for which insertion should be tried.
        IMPORTANT:
        It is important that each of these te have its 'special' attribute
        set to True so that infinite recursion is reliably avoided.
        (special te won't result in a call to try_special if they are
        not handled)
        """
        return ps.profile.special_replace(ps, te)

    def __init__(self, et_page, profile):
        self.page_num = int(et_page.attrib['number'])
        if DBG:
            print "PAGE", self.page_num

        self.page_height = int(et_page.attrib['height'])
        self.page_width = int(et_page.attrib['width'])

        self.tables = []

        ps = self.ParseState(self, et_page, profile)

        # dispatch every TE in (top, left) order until the page bottom
        for te in ps.tels:
            if self.process_te(ps, te) == END_OF_PAGE:
                break
        ps.close_curt()

        # report TEs that were never consumed by any rule
        for state, unh_tels in ps.unhandled.iteritems():
            for ud in unh_tels:
                if not ud.te.handled:
                    print "WWW unhandled %r ### %r" % \
                        ((self.page_num, ud.te.identity()),
                         (ud.table_ref, ud.reg_name, state))
+
+
class AbsTables(object):
    """
    Collection of abstract (page-independent) tables.  A table seen again
    on a later page (multi-sheet table) is merged into the existing
    abstract table; lookup is by table reference or register base name.
    """
    def __init__(self):
        self.tables = []
        # "chapter-num" table reference -> abstract Table
        self.by_ref = {}
        # register base name -> abstract Table
        self.by_base_name = {}
    def append_table(self, t):
        """
        Absorb a concrete table: first sighting clones it to an abstract
        table, later sightings (same table_ref) extend it with more rows.
        Raises ValueError when a different table claims the same register.
        """
        if t.table_ref not in self.by_ref:
            if t.reg_base_name in self.by_base_name:
                raise ValueError("already have a table (ref: %s) "
                                 "for register %s" %
                                 (self.by_base_name[t.reg_base_name].table_ref,
                                  t.reg_base_name))
            new_abs_t = t.clone_abstract()
            self.tables.append(new_abs_t)
            self.by_ref[t.table_ref] = new_abs_t
            self.by_base_name[t.reg_base_name] = new_abs_t
        else:
            # continuation sheet: the header fields should agree
            ml = self.by_ref[t.table_ref].header_mismatch(t)
            if ml:
                print "MISMATCH %s:" % t.table_ref, ml
            self.by_ref[t.table_ref].extend_abstract(t)
    def find_by_base_name(self, base_name):
        return self.by_base_name.get(base_name)
    def find_by_offset(self, offset):
        """
        Table covering offset, or None.  For recurring registers any
        instance offset (base + k * increment, k < count) matches.
        """
        for t in self.tables:
            if offset == t.offset:
                return t
            if t.recurring and (offset > t.offset):
                delta = offset - t.offset
                # NOTE(review): relies on Python 2 integer division for
                # the instance-index computation
                if (delta % t.offset_start[1]) == 0 \
                   and (delta / t.offset_start[1]) < t.recurring:
                    return t
    # for use in post_fix
    def auto_tables_offset(self):
        """Default each table's offset from its 'Offset Start' header field."""
        for t in self.tables:
            t.auto_offset()
+ t.auto_offset()
+
+
def abs_tables_from_pages(pages, profile):
    """
    Simple wrapper: parse every loaded page, merge the resulting tables
    into an AbsTables collection, run per-table content analysis and the
    profile's post-fix pass.
    """
    abs_tables = AbsTables()
    for page in pages:
        for table in PageTE(page, profile).tables:
            abs_tables.append_table(table)
    for table in abs_tables.tables:
        table.analyse_content()
    profile.post_fix(abs_tables)
    return abs_tables