diff options
Diffstat (limited to 'parse_datasheet/datasheet_tables.py')
-rw-r--r-- | parse_datasheet/datasheet_tables.py | 1035 |
1 files changed, 1035 insertions, 0 deletions
#!/usr/bin/python

# Extraction of register description tables from datasheet pages given as
# XML <page> elements containing positioned <text> children (attributes:
# number/height/width on pages, top/left/width/height on text elements).
#
# The code is written in the common subset of Python 2 and Python 3:
#  - print is always called with ONE parenthesised argument, which behaves
#    identically as a Python 2 statement and as a Python 3 function call;
#  - integer divisions use '//';
#  - ASCII conversions always produce the native 'str' type.

try:
    import xml.etree.cElementTree as ET
except ImportError:
    # cElementTree was removed in Python 3.9
    import xml.etree.ElementTree as ET
from operator import attrgetter, itemgetter
import re
from collections import namedtuple
from copy import copy

DBG = 0

_PY2 = str is bytes

try:
    _string_types = (basestring,)
except NameError:
    _string_types = (str,)


def _ascii_str(text, errors='strict'):
    """
    Encode text to ASCII and return it as the native 'str' type
    (byte string on Python 2, text string on Python 3).
    errors is forwarded to the codec ('strict', 'replace', ...).
    """
    encoded = text.encode('ascii', errors)
    if _PY2:
        return encoded
    return encoded.decode('ascii')


def filter_pages(doc, min_page, max_page):
    """
    Return the children of the root of doc (an ElementTree) whose 'number'
    attribute lies in [min_page, max_page] (both bounds included).
    """
    return [page
            for page in doc.getroot()
            if min_page <= int(page.attrib['number']) <= max_page]


def load_datasheet_pages(filepath, min_page, max_page):
    """Parse the XML file at filepath and return its pages in the range."""
    doc = ET.parse(filepath)
    return filter_pages(doc, min_page, max_page)


# Typographic characters replaced by an ASCII look-alike in asciify()
ASCIIFY_TABLE = {
    u'\u201c': '"',
    u'\u201d': '"',
    u'\u2019': "'",
    u'\u2018': "'",
    u'\u2013': "-",
}

def asciify(us):
    """
    Return an ASCII-only native string built from us: characters listed in
    ASCIIFY_TABLE are substituted, any other non ASCII one becomes '?'.
    """
    return ''.join([_ascii_str(ASCIIFY_TABLE.get(ch, ch), 'replace')
                    for ch in us])


class State(object):
    """Named marker object; instances are compared by identity."""
    __slots__ = ['name']
    def __init__(self, name):
        self.name = name
    def __repr__(self):
        return '<%s>' % self.name


def _get_all_text_rec(elem, parts):
    """
    Append to parts, in document order, the text of elem, of all its
    descendants, and the tails of those descendants. Returns parts.
    """
    if elem.text:
        parts.append(elem.text)
    # iterating an Element yields its children
    # (Element.getchildren() was removed in Python 3.9)
    for child in elem:
        _get_all_text_rec(child, parts)
        if child.tail:
            parts.append(child.tail)
    return parts

def get_all_text(elem):
    """Return the whole text contained in elem, nested markup stripped."""
    return u''.join(_get_all_text_rec(elem, []))


TE_Id = namedtuple('TE_Id', 'text top left width height')

class SomeText(object):
    """
    A positioned text element.

    handled: set to True once the element has been consumed by the parser.
    special: True for elements created by a profile to replace an original
             one (see ProfileBase.special_te_mapper).
    """
    __slots__ = ['text', 'top', 'left', 'width', 'height', 'handled', 'special']
    def __init__(self, text, top, left, width, height, special):
        self.text = text
        self.top = top
        self.left = left
        self.width = width
        self.height = height
        self.special = special
        self.handled = False
    @classmethod
    def from_et(cls, elem):
        """Build a non special instance from a <text> XML element."""
        return cls(get_all_text(elem),
                   int(elem.attrib['top']),
                   int(elem.attrib['left']),
                   int(elem.attrib['width']),
                   int(elem.attrib['height']),
                   special=False)
    @classmethod
    def special_from_id(cls, i):
        """Build a special instance from a TE_Id."""
        return cls(i.text, i.top, i.left, i.width, i.height, special=True)
    def identity(self):
        """Return the TE_Id (text and geometry) of this element."""
        return TE_Id(self.text, self.top, self.left, self.width, self.height)
    def __repr__(self):
        return '<SomeText top=%r left=%r width=%r height=%r text=%r%s>' % \
            (self.top, self.left, self.width, self.height, self.text,
             " special" if self.special else "")


def dbg_fmt_hor_pos(hp):
    """
    Return the items of a {column name: [left, right] or None} dict sorted
    by position, the not yet positioned (None) columns coming first.
    """
    # explicit key: Python 3 can not compare None with a list
    return sorted(hp.items(),
                  key=lambda kv: (kv[1] is not None, kv[1] or []))

# In left to right order:
# WARNING: this should not ever need to change, but if it does after all
# you should carefully review this whole file...
COL_RANGE = 0
COL_ACRONYM = 1
COL_DESCRIPTION = 2
COL_STICKY = 3
COL_RESET = 4
COL_ACCESS = 5
COL_NUMBER = 6

CD = namedtuple('CD', 'width idx')

class ColumnsPos(object):
    """
    Horizontal positions of the columns of one table, computed from the
    column header text elements.

    hor_pos maps each column name to [left, right], or to None as long as
    the header of that column has not been seen.
    ver_pos is the top of the 'bit range' header (None until seen).
    """

    # header text (lower case) -> nominal width and column index
    Columns = {
        'bit range': CD(60, COL_RANGE),
        'bit acronym': CD(58, COL_ACRONYM),
        'bit description': CD(204, COL_DESCRIPTION),
        'sticky': CD(28, COL_STICKY),
        'bit reset': CD(58, COL_RESET),
        'bit access': CD(51, COL_ACCESS),
    }

    # max distance between the edges of two columns considered adjacent
    ADJUST = 10

    def __init__(self):
        self.hor_pos = dict.fromkeys(self.Columns)
        self.ver_pos = None
        # DBG:
        self.raw_hor_pos = dict(self.hor_pos)

    def overlap(self, txt, left, right):
        """
        Return the name of a positioned column intersecting [left, right),
        or None if there is none. txt is not used.
        """
        for k, a in self.hor_pos.items():
            if a and (left < a[1]) and not (right <= a[0]):
                return k

    def find_k_left(self, left):
        """Name of a column whose right edge is within ADJUST of left."""
        for k, a in self.hor_pos.items():
            if a and (left - self.ADJUST <= a[1] <= left + self.ADJUST):
                return k

    def find_k_right(self, right):
        """Name of a column whose left edge is within ADJUST of right."""
        for k, a in self.hor_pos.items():
            if a and (right - self.ADJUST <= a[0] <= right + self.ADJUST):
                return k

    def adjust(self, pos_a, cd_a, pos_b, cd_b):
        """
        Return the common boundary of two adjacent columns: the average of
        pos_a and pos_b weighted by the width of the *other* column,
        rounded to the nearest integer.
        """
        w = cd_a.width + cd_b.width
        # the longest can be adjusted more:
        return ((pos_a * cd_b.width)     # move less if the other is longer
                + (pos_b * cd_a.width)
                + w // 2) // w

    def try_column_header(self, te, dbg=""):
        """
        Try to use te as a column header.

        Returns False if te is not a known column header. Else records the
        position of the column (centered on te, with the nominal width of
        the column, then snapped against adjacent columns) and returns
        True. 'value' is only checked against the 'bit reset' column.
        dbg is only used in warning messages.
        Raises ValueError if the column overlaps an already positioned one.
        """
        txt = te.text.strip().lower()
        if txt == 'bit range':
            self.ver_pos = te.top
        cd = self.Columns.get(txt)
        if (not cd) and txt == 'value':
            cd = self.Columns['bit reset']
        if not cd:
            return False

        left = te.left + te.width // 2 - cd.width // 2
        right = left + cd.width

        if txt == 'value':
            reset_pos = self.hor_pos['bit reset']
            if reset_pos is None:
                # no reference to align with (this used to raise TypeError)
                print("WARNING: %r: 'value' (%d, %d) seen before 'bit reset'"
                      % (dbg, left, right))
                return False
            o_l, o_r = reset_pos
            if not (o_l - self.ADJUST <= left <= o_l + self.ADJUST) \
               or not (o_r - self.ADJUST <= right <= o_r + self.ADJUST):
                print("WARNING: %r: 'value' (%d, %d) not haligned with "
                      "'bit reset' ### %r"
                      % (dbg, left, right, vars(self)))
            return True

        # DBG:
        self.raw_hor_pos[txt] = [left, right]

        k_left = self.find_k_left(left)
        if k_left:
            near_left = self.hor_pos[k_left][1]
            cd_left = self.Columns[k_left]
            if cd_left.idx + 1 != cd.idx:
                print("WARNING: %r: column left of '%s' should not be '%s'"
                      % (dbg, txt, k_left))
            self.hor_pos[k_left][1] = left = self.adjust(left, cd,
                                                         near_left, cd_left)
        k_right = self.find_k_right(right)
        if k_right:
            near_right = self.hor_pos[k_right][0]
            cd_right = self.Columns[k_right]
            if cd_right.idx - 1 != cd.idx:
                print("WARNING: %r: column right of '%s' should not be '%s'"
                      % (dbg, txt, k_right))
            self.hor_pos[k_right][0] = right = self.adjust(right, cd,
                                                           near_right, cd_right)

        other_k = self.overlap(txt, left, right)
        if other_k:
            raise ValueError("column %r (%d, %d) overlaps with %r (%d, %d)"
                             % (txt, left, right, other_k,
                                self.hor_pos[other_k][0],
                                self.hor_pos[other_k][1]))

        self.hor_pos[txt] = [left, right]

        return True

    @staticmethod
    def matching_level(l, r, l_ref, r_ref):
        """
        Return the fraction (0.0 .. 1.0) of [l, r) which is covered
        by [l_ref, r_ref).
        """
        l_com = max(l, l_ref)
        r_com = min(r, r_ref)
        if l_com >= r_com:
            return 0.0
        else:
            return (r_com - l_com) / float(r - l)

    def search(self, left, right, min_match_level, out_level=None):
        """
        Return the index of a column covering at least min_match_level of
        [left, right), or None if there is none. If out_level is a list,
        the matching level found is stored in out_level[0].
        """
        for k, pos in self.hor_pos.items():
            if pos is None:
                # header of this column never seen: nothing to match
                continue
            level = self.matching_level(left, right, pos[0], pos[1])
            if level >= min_match_level:
                if out_level is not None:
                    out_level[0] = level
                return self.Columns[k].idx


class Cell(object):
    """
    A table cell: accumulates the text elements inserted in it as a list
    of ASCII paragraphs.
    """
    ADJ_X = 5
    def __init__(self, column, col_name):
        self.paragraphs = []
        self.last_te = None
        self.column = column
        self.col_name = col_name
    def is_empty(self):
        return self.paragraphs == []
    def insert_te(self, te):
        """
        Add the text of te: it is appended to the last paragraph when te
        starts no lower than the bottom of the previously inserted element,
        else it starts a new paragraph.
        """
        txt = asciify(te.text)
        if self.last_te \
           and (te.top <= self.last_te.top + self.last_te.height):
            self.paragraphs[-1] += txt
        else:
            self.paragraphs.append(txt)
        self.last_te = te
    def analyse(self):
        self.analysis_warning = False
    # post-analyse
    def simple_lines(self):
        """Return the printable lines: column name, then the paragraphs."""
        res = [self.col_name]
        res.extend([("\t" + p) for p in self.paragraphs])
        return res


class CellSymbol(Cell):
    """
    Cell expected to hold a single paragraph; analyse() stores it,
    stripped, in self.content.
    """
    WARN_EMPTY = True
    def analyse(self):
        if self.is_empty():
            self.content = ''
            self.analysis_warning = self.WARN_EMPTY
        else:
            self.content = self.paragraphs[0].strip()
            self.analysis_warning = (len(self.paragraphs) > 1)
    def warn(self):
        return "\t\t<warning>" if self.analysis_warning else ""
    def simple_lines(self):
        return ["%-13s %s%s" % (self.col_name, self.content, self.warn())]


class CellRange(CellSymbol):
    """
    Bit range cell. After analyse(), self.range is a (first, second) tuple
    of ints as written in the cell (a single bit n gives (n, n)), or None
    if the content could not be parsed.
    """
    def _parse(self):
        m = re.match(r"(\d+)[- :]+(\d+)", self.content)
        if m:
            self.range = (int(m.group(1)), int(m.group(2)))
            return
        try:
            single_bit = int(self.content)
            self.range = (single_bit, single_bit)
            return
        except ValueError:
            pass
    def analyse(self):
        CellSymbol.analyse(self)
        self.range = None
        self._parse()
        if self.range is None:
            self.analysis_warning = True
    def bits_width(self):
        """Return range[0] + 1 - range[1], or None for an unknown range."""
        if self.range is None:
            return None
        return self.range[0] + 1 - self.range[1]
    def str_range(self):
        if self.range is None:
            return 'None'
        elif self.range[0] == self.range[1]:
            return "%d" % self.range[0]
        else:
            return "%d-%d" % self.range
    def simple_lines(self):
        return ["%-13s %s%s" % (self.col_name, self.str_range(), self.warn())]


class CellConstraint(CellSymbol):
    """
    Cell whose content must be one of the strings listed in ALLOWED, else
    analyse() raises the analysis warning.
    """
    # abstract class, derivate and add e.g.
    # ALLOWED = ('RO', 'RW')
    def analyse(self):
        CellSymbol.analyse(self)
        if self.content not in self.ALLOWED:
            self.analysis_warning = True


class CellSticky(CellConstraint):
    WARN_EMPTY = False # for upper layers
    ALLOWED = ('Y', 'N', '')


class CellAccess(CellConstraint):
    ALLOWED = ('RW', 'RO', 'RWS',        # <= enough for smrbase
               'RWC', 'RWO', 'RW0C',     # + imch_conf
               'RWL',                    # + imch_conf
               'RV', 'RCWC', 'WO', 'RC', # + gbe
               'RO/RWC')                 # + gbe


def to_bin(i):
    """
    Return the binary digits of the integer i, without any prefix
    (to_bin(5) == '101'); negative numbers get a leading '-'.
    """
    return format(i, 'b')

class CellReset(CellSymbol):
    """
    Reset value cell. After analyse():
      value: int, or None if the content could not be parsed
      base:  2 ('...b' suffix, or plain '0' / '1'), 16 ('...h' suffix),
             or None
      bits:  width of the field in bits, None until set_bits() is called
    """
    def analyse(self):
        CellSymbol.analyse(self)
        self.value = None
        self.base = None
        self.bits = None
        if self.content.endswith('b'):
            self.base = 2
        elif self.content.endswith('h'):
            self.base = 16
        if self.base:
            try:
                self.value = int(self.content[:-1], self.base)
            except ValueError:
                pass
        if self.content in ('0', '1'):
            self.value = int(self.content)
            self.base = 2
        if self.value is None:
            self.analysis_warning = True
    # post-analyse
    def set_bits(self, bits):
        self.bits = bits
    def bin_repr(self):
        """Binary form with 'b' suffix, zero padded to self.bits digits."""
        if self.value is None:
            return 'None'
        digits = to_bin(self.value)
        if self.bits is None:
            return digits + 'b'
        else:
            return ('0' * (self.bits - len(digits))) + digits + 'b'
    def hex_repr(self):
        """Hex form with 'h' suffix, zero padded to cover self.bits bits."""
        if self.value is None:
            return 'None'
        if self.bits is None:
            return "%xh" % self.value
        else:
            nibs = (self.bits + 3) // 4
            return ("%%0%dxh" % nibs) % self.value
    def simple_repr(self):
        """Both forms, the one matching self.base between brackets."""
        if self.value is None:
            return 'None'
        bh = (self.bin_repr(), self.hex_repr())
        if self.base == 2:
            return "[ %s ]\t %s " % bh
        else:
            return " %s \t[ %s ]" % bh
    def simple_lines(self):
        return ["%-11s %s%s" % (self.col_name, self.simple_repr(), self.warn())]


# Result codes, and where they are returned from:
#                                        Line   PageTE
INSERTED = State('INSERTED')           #  X      X
NEW_LINE = State('NEW_LINE')           #  X
INVALID_LINE = State('INVALID_LINE')   #  X
NOT_HANDLED = State('NOT_HANDLED')     #         X
END_OF_PAGE = State('END_OF_PAGE')     #         X

# only checks the FIRST character of the text
VALID_TE_RANGE = re.compile(r'[-0-9 :]').match

class Line(object):
    """
    One line of a table (one bit field): a cell per column, plus the
    vertical extent [top, bottom) of the line. bottom is None as long as
    no text element has been inserted in the range cell.
    """
    def __init__(self, top):
        # direct symbolic cell access
        # prefixed by c_ cause i might also use unprefixed attrs in the
        # future and directly put payloads there.
        self.c_range       = CellRange  (COL_RANGE,       'range')
        self.c_acronym     = CellSymbol (COL_ACRONYM,     'acronym')
        self.c_description = Cell       (COL_DESCRIPTION, 'description')
        self.c_sticky      = CellSticky (COL_STICKY,      'sticky')
        self.c_reset       = CellReset  (COL_RESET,       'reset')
        self.c_access      = CellAccess (COL_ACCESS,      'access')
        # access by columns index:
        self.cells = [
            self.c_range,
            self.c_acronym,
            self.c_description,
            self.c_sticky,
            self.c_reset,
            self.c_access,
        ]
        self.top = top
        self.bottom = None
        self.bits = None
    def is_empty(self):
        for c in self.cells:
            if not c.is_empty():
                return False
        return True
    def try_insert_te(self, column, te):
        """
        Try to insert te in the cell of the given column index.

        Returns:
          INSERTED     te has been inserted
          NEW_LINE     te starts at or below the bottom of this line
          INVALID_LINE te would be the first element of the range cell but
                       does not look like a bit range
        """
        if (self.bottom is None) or (te.top < self.bottom):
            # NOTE: this only works if the range cell contains only one line
            if (self.bottom is None) and (column == COL_RANGE):
                if not VALID_TE_RANGE(te.text):
                    return INVALID_LINE
                # the range text is vertically centered in the line
                self.bottom = 2 * te.top - self.top
            if (self.bottom is not None) \
               and (te.top + te.height - 1 > self.bottom):
                self.bottom = te.top + te.height - 1
            self.cells[column].insert_te(te)
            return INSERTED
        else:
            return NEW_LINE
    def simple_print(self):
        for c in self.cells:
            for l in c.simple_lines():
                print(l)
    def value_print(self, value):
        """
        Print the line with an additional 'value' pseudo cell, inserted
        after the acronym, showing the bits of this field extracted from
        the register value. analyse_line() must have been called before.
        """
        cell_value = copy(self.c_reset)
        bit_range = self.c_range.range
        if (bit_range is None) or (cell_value.bits is None) \
           or (cell_value.bits < 0):
            # position or width unknown: the field can not be extracted
            cell_value.value = None
        else:
            bit_base = bit_range[1]
            bit_mask = (1 << cell_value.bits) - 1
            cell_value.value = (value >> bit_base) & bit_mask
        cell_value.col_name = "value"
        for c in self.cells[:2] + [cell_value] + self.cells[2:]:
            for l in c.simple_lines():
                print(l)
    def analyse_line(self):
        """Analyse all cells, then propagate the width to the reset cell."""
        for c in self.cells:
            c.analyse()
        self.bits = self.c_range.bits_width()
        self.c_reset.set_bits(self.bits)


def strip_strict_ascii(s):
    """Strip s; raises UnicodeEncodeError if it is not pure ASCII."""
    return _ascii_str(s.strip(), 'strict')

def parse_offset(s):
    """
    Parse the '<hex>h' or '<hex>h at <hex>h' found at the end of s.
    Returns (base, inc), inc being None in the first form.
    Raises ValueError if s does not end with such an offset.
    """
    m = re.search(r"([0-9a-f]+)h(?:\s+at\s+([0-9a-f]+)h)? *$", s, re.I)
    if m:
        base, inc = m.groups()
        base = int(base, 16)
        if inc:
            inc = int(inc, 16)
        return base, inc
    else:
        raise ValueError("can't parse offset %r" % s)

def format_offset(off):
    """Format an offset which can be None, a string or an int."""
    if off is None:
        return "-"
    if isinstance(off, _string_types):
        return off
    else:
        return "%02Xh" % off

def parse_size(s):
    """Parse '<n> bit...' and return n. Raises ValueError otherwise."""
    m = re.match(r"(\d+)\s+bit", s, re.I)
    if m:
        return int(m.group(1))
    else:
        raise ValueError("can't parse size %r" % s)

def parse_default(s):
    """
    Parse a default value: '<hex>h' gives an int. Stripped s is returned
    unchanged if it does not end with 'h', or if it is not valid hex but
    contains an 'X'. Any other invalid hex raises ValueError.
    """
    s = s.strip()
    if not s.endswith('h'):
        # raise ValueError("can't parse default value %r" % s)
        return s
    try:
        return int(s[:-1], 16)
    except ValueError:
        if "X" in s: # gruik hack
            return s
        raise

# Header attribute formatters. All take (table, value); a None value,
# which would have raised TypeError, is rendered as "-" like in
# format_offset() (fmt_identity excepted, which returns its input as is).

def fmt_identity(t, x):
    return x

def fmt_hex(t, x):
    if x is None:
        return "-"
    return "%xh" % x

def fmt_offset_start_end(t, x):
    """Inverse of parse_offset(): x is a (base, inc or None) tuple."""
    if x is None:
        return "-"
    if x[1] is None:
        return "%xh" % x[0]
    else:
        return "%xh at %xh" % x

def fmt_default(t, x):
    """Hex form of x zero padded to cover t.size bits; strings as is."""
    if x is None:
        return "-"
    if isinstance(x, _string_types):
        return x
    if t.size is None:
        # width unknown: no padding
        return "%xh" % x
    nibs = (t.size + 3) // 4
    return ("%%0%dxh" % nibs) % x


FD = namedtuple('FD', 'attr_name adj_y adj_x parser')

class Table(object):
    """
    A register description table: the information of its title, the
    fields of its header, and its lines (one Line per bit field).
    """

    # WARNING:
    # offset_end often contains random information in recurring registers

    # Header fields, by label. adj_y / adj_x are the tolerances used when
    # searching the value text element at the right of the label.
    #                                  attr_name       adj_y adj_x parser
    Fields = { 'Description:':  FD('description',  2, 5, strip_strict_ascii),
               'View:':         FD('view',         2, 5, strip_strict_ascii),
               'BAR:':          FD('bar',          2, 5, strip_strict_ascii),
               'Offset Start:': FD('offset_start', 2, 5, parse_offset),
               'Offset End:':   FD('offset_end',   2, 5, parse_offset),
               'Power Well:':   FD('power_well',   2, 5, strip_strict_ascii),
               'Size:':         FD('size',         2, 5, parse_size),
               'Default:':      FD('default',      2, 5, parse_default),
               'Bus:Device:Function:': FD('bus_device_function', 2, 5, strip_strict_ascii) }

    # (attribute name, formatter) in display order
    HeaderAttr = [
        # from table title:
        ('table_ref', fmt_identity),
        ('offset', fmt_hex),
        ('reg_name', fmt_identity),
        ('recurring', fmt_identity),
        ('reg_base_name', fmt_identity),
        ('title_desc', fmt_identity),

        # from table header:
        ('description', fmt_identity),
        ('view', fmt_identity),
        ('bar', fmt_identity),
        ('offset_start', fmt_offset_start_end),
        ('offset_end', fmt_offset_start_end),
        ('power_well', fmt_identity),
        ('size', fmt_identity),
        ('default', fmt_default),
        ('bus_device_function', fmt_identity),
    ]
    HeaderAttrDict = dict(HeaderAttr)

    START_LINE_MAX_HEIGHT = 11 # max interval _between_ title lines
    COL_HEADER_HEIGHT = 26
    def __init__(self, chapter, table_num, offset, reg_name, title_desc):
        self.chapter = chapter
        self.table_num = table_num
        self.offset = offset
        self.reg_name = reg_name
        self.title_desc = title_desc
        self.table_ref = "%d-%d" % (self.chapter, self.table_num)
        #
        self.recurring, self.reg_base_name = self.parse_reg_name(reg_name)
        #
        self.description = None
        self.view = None
        self.bar = None
        self.offset_start = None
        self.offset_end = None
        self.power_well = None
        self.size = None
        self.default = None
        self.bus_device_function = None
        #
        self.columns = None
        self.cur_line = None
        self.lines = []
    @staticmethod
    def parse_reg_name(reg_name):
        """
        Split 'NAME[a-b]' (or 'NAME[a:b]') in (count, 'NAME') where count
        is max(a, b) + 1. Returns (None, reg_name) for a name without
        index range, and (None, 'NAME') with a warning when the range does
        not start at 0.
        """
        m = re.match(r"(.*?)\[(\d+)[-:](\d+)\]", reg_name)
        if not m:
            return None, reg_name
        reg_base_name, left, right = m.groups()
        left = int(left)
        right = int(right)
        low = min(left, right)
        high = max(left, right)
        if low != 0:
            print("WARNING: don't know how to handle recurring register %s "
                  "not starting at index 0. Loading as non recurring for now."
                  % reg_name)
            return None, reg_base_name
        return high + 1, reg_base_name
    def attrs_set_abstract(self):
        """Delete the attributes which are only useful while parsing."""
        del self.columns
        del self.cur_line
    def clone_abstract(self):
        """
        Clone the abstract part of this table, that is everything needed for
        extraction of payload informations.
        result.columns and result.cur_line are deleted.
        self.lines is shallow copied.
           => result.lines = self.lines[:]
        """
        result = copy(self)
        result.attrs_set_abstract()
        result.lines = self.lines[:]
        return result
    def extend_abstract(self, other):
        """
        Once a first 'concrete' table has been cloned to an abstract version,
        it can be extended with other parts of the same table (from subsequent
        pages). Example if a table is split between (end of) page p_1 and
        (start of page) p_2:

            pte_1 = PageTE(p_1, profile)
            pte_2 = PageTE(p_2, profile)
            full_table = pte_1.tables[-1].clone_abstract()
            full_table.extend_abstract(pte_2.tables[0])
        """
        self.lines.extend(other.lines)
    def header_mismatch(self, other):
        """
        Return the list of (attr, self value, other value) for all the
        header attributes which differ between self and other.
        """
        return [
            (attr, getattr(self, attr), getattr(other, attr))
            for (attr, _) in self.HeaderAttr
            if getattr(self, attr) != getattr(other, attr)
        ]
    def use_columns_pos(self):
        self.columns = ColumnsPos()
    def cat_start_line(self, text):
        """Append text (which must be pure ASCII) to the title."""
        self.title_desc += _ascii_str(text, 'strict')
    def set_field_parse(self, fd, s):
        """Parse s with the parser of fd and store it in its attribute."""
        try:
            setattr(self, fd.attr_name, fd.parser(s))
        except Exception:
            print("exception while parsing %r" % vars(self))
            raise
    def close(self):
        """End of parsing: drop the '(Sheet x of y)' part of the title."""
        self.title_desc = re.sub(r'\s*\(Sheet .* of .*\)\s*', ' ',
                                 self.title_desc).strip()
        self.cur_line = None
    def start_content(self):
        """Create the first line, just under the column headers."""
        # NOTE(review): raises TypeError if the 'bit range' header has not
        # been seen (columns.ver_pos is None); left as is.
        self.cur_line = Line(self.columns.ver_pos + self.COL_HEADER_HEIGHT // 2)
        self.lines.append(self.cur_line)
    def insert_content_te(self, column, te):
        """
        Insert te in the current line, or in a new line if te is below it.
        Returns False, after having dropped the current line, when te
        shows that this line is not a valid one; else returns True.
        """
        res = self.cur_line.try_insert_te(column, te)
        if res is INSERTED:
            return True
        elif res is NEW_LINE:
            if self.cur_line.bottom is None:
                next_top = te.top + 1 - te.height
            else:
                next_top = self.cur_line.bottom
            self.cur_line = Line(next_top)
            self.lines.append(self.cur_line)
            # NOTE(review): the result is ignored, so an INVALID_LINE here
            # leaves an empty line in the table. Behaviour kept as is,
            # existing profiles might rely on it -- to be confirmed.
            self.cur_line.try_insert_te(column, te)
            return True
        else: # INVALID_LINE
            self.lines = self.lines[:-1]
            if self.lines:
                self.cur_line = self.lines[-1]
            else:
                self.cur_line = None
            return False
    def analyse_content(self):
        for l in self.lines:
            l.analyse_line()
    def check_ranges(self):
        """
        Return True if the ranges of the lines, taken in order, exactly
        cover bits self.size - 1 down to 0 without hole nor overlap.
        """
        rembits = self.size
        for l in self.lines:
            if (l.c_range.range is None) or (rembits != l.c_range.range[0] + 1):
                return False
            rembits = l.c_range.range[1]
        return rembits == 0
    def auto_offset(self):
        """Default a missing offset to the base of offset_start, if any."""
        if (self.offset is None) and (self.offset_start is not None):
            self.offset = self.offset_start[0]
    def fmt(self, field_name):
        """Return the formatted value of a header attribute."""
        fmt_func = self.HeaderAttrDict[field_name]
        return fmt_func(self, getattr(self, field_name))
    def value_print(self, value):
        """Print the header, then each field decoded from value."""
        print('============================================================')
        print('%s %s' % (fmt_hex(self, self.offset), self.reg_name))
        for h, fmt in self.HeaderAttr:
            print(" %-15s %s" % (h, fmt(self, getattr(self, h))))
        print("")
        for l in self.lines:
            print(" ----------------")
            l.value_print(value)
            print("")
        print("")


class ProfileBase(object):
    """
    To properly parse register tables from the informally written 320066
    datasheet, it is typically needed to perform some adjustments during
    the dispatching of text elements, and/or after the parse stage itself
    on abstracted tables. Also some tables need to be blacklisted, and the
    page range to be defined.

    Derive this class to implement such a register set profile,
    then register it by calling register_profile(derived_class)
    """

    name = None         # string - symbolic name of the register set
    min_page = None     # int - first page to parse (included)
    max_page = None     # int - last page to parse (included)

    @staticmethod
    def special_te_mapper(ps, te, table):
        """
        This method can be used to help implement the .special_replace() method.
        ps: Parse State - instance of PageTE.ParseState
        te: Text Element - instance of SomeText
        table: replacement (hash) table, ex:
          {
            (432, TE_Id(text=u'N 00001b', top=384, left=409, width=60, height=8)):
              [TE_Id(text=u'N', top=384, left=409, width=4, height=8),
               TE_Id(text=u'00001b', top=384, left=427, width=50, height=8)],
          }
          The key is (page_num, te_id) where te_id describes the original text
          element which could not be fitted in a table cell.
          The value is a list of TE_Id, to replace the offending one.
        Returns the list of special SomeText to try instead of te, or None
        if te is not in the table.
        """
        key = (ps.pte.page_num, te.identity())
        repl_ids = table.get(key)
        if repl_ids is not None:
            return [SomeText.special_from_id(p) for p in repl_ids]

    def special_replace(self, ps, te):
        """See PageTE.try_special(). By default nothing is special."""
        return None

    def post_fix(self, a_t):
        """Hook called with the AbsTables once everything is analysed."""
        pass

    def table_blacklisted(self, chapter, table_num):
        """
        chapter: int
        table_num: int
        """
        return False


_profiles = {}

def register_profile(profile_class):
    """
    Register profile_class under its 'name' attribute.
    Raises ValueError if that name is not a string.
    """
    if not isinstance(profile_class.name, _string_types):
        raise ValueError("profile_class.name should be a string")
    _profiles[profile_class.name] = profile_class


def profile_factory(profile_name):
    """
    Return a new instance of the profile registered as profile_name.
    Raises ValueError if there is no such profile.
    """
    if profile_name not in _profiles:
        raise ValueError("profile %s does not exist" % profile_name)
    profile_class = _profiles[profile_name]
    return profile_class()


# Unhandled Description
UD = namedtuple('UD', 'te table_ref reg_name')

ParsedTitle = namedtuple('ParsedTitle',
                         'chapter table_num offset reg_name title_desc')

class PageTE(object):
    """
    Parse the text elements of one page. The tables (or parts of tables)
    found are available in self.tables once the instance is created.
    """

    class ParseState(object):
        """
        This contains attributes that are shared between PageTE methods
        during the parse stage, but really conceptually are local vars.
        I put them all here because it's easier to get rid of them all
        like that, and also less error prone. Plus they are clearly
        identified, so user code won't try to do stupid things and hopefully
        the mess in PageTE will be kept limited.
        """
        NOP = State('NOP')
        TITLE = State('TITLE')
        HEADER = State('HEADER')
        COL_HEADER = State('COL_HEADER')
        CONTENT = State('CONTENT')
        def __init__(self, pte, et_page, profile):
            # tels: text elements
            # curt: current table
            # st: state (one of the above)
            #
            self.unhandled = {# no NOP
                              self.TITLE: [],
                              self.HEADER: [],
                              self.COL_HEADER: [],
                              self.CONTENT: []}
            self.pte = pte
            self.tels = [SomeText.from_et(elem)
                         for elem in et_page.findall('text')]
            self.tels.sort(key=lambda elem: (elem.top, elem.left))
            # text elements indexed by their left / top coordinate
            self.tels_X = [[] for _ in range(self.pte.page_width)]
            self.tels_Y = [[] for _ in range(self.pte.page_height)]
            for te in self.tels:
                # elements located outside of the page are not indexed
                # (a negative coordinate would silently index from the
                # end, a too big one would raise IndexError)
                if 0 <= te.left < self.pte.page_width:
                    self.tels_X[te.left].append(te)
                if 0 <= te.top < self.pte.page_height:
                    self.tels_Y[te.top].append(te)
            self.curt = None
            self.st = self.NOP
            self.title_y = None
            self.profile = profile
        def close_curt(self):
            """Close the current table, if any, and go back to NOP."""
            if self.curt:
                self.curt.close()
            self.curt = None
            self.st = self.NOP
        def find_te_around(self, y, x, ay, ax):
            """
            Return a text element whose top is within ay of y and whose
            left is within ax of x, or None if there is none.
            """
            y_min = max(0, y - ay)
            y_max = min(self.pte.page_height, y + ay + 1)
            x_min = max(0, x - ax)
            x_max = min(self.pte.page_width, x + ax + 1)
            for tel_line in self.tels_Y[y_min:y_max]:
                for te in tel_line:
                    if x_min <= te.left < x_max:
                        return te

    WARN_EMPTY = False

    # min part of a text element which must be in a column to belong to it
    MIN_MATCH_COL = 0.9

    # any text under this point marks and end of page
    PAGE_BOTTOM = 732           # gruik hack
    TABLE_CHAR_MAX_SIZE = 10    # quite gruik too

    # not a raw string: a raw unicode literal can not be written in a way
    # which is valid for both Python 2 and 3, hence the doubled backslashes
    start_search = re.compile(u"""
        Table\\s+(\\d+)-(\\d+)\\s*\\.\\s*              # <chapter>-<table_num>
        (?:Offset\\s+([0-9a-f]+)h\\s*:\\s*)?           # <offset>
        (\\S+?(?:\\[\\S+?\\])?)(?:\\s?[-\u2013:])+     # <reg_name>
        (.*)$                                          # <title_desc>
        """, re.I | re.VERBOSE).search

    def parse_table_title(self, ps, text):
        """
        Return the ParsedTitle of text, or None if text is not the title
        of a table or if the profile blacklists this table.
        Raises UnicodeEncodeError if the name or description found in the
        title is not pure ASCII.
        """
        m = self.start_search(text)
        if not m:
            return None
        chapter = int(m.group(1))
        table_num = int(m.group(2))
        offset_group = m.group(3)
        offset = (offset_group and int(offset_group, 16))
        if ps.profile.table_blacklisted(chapter, table_num):
            return None
        return ParsedTitle(
            chapter=chapter,
            table_num=table_num,
            offset=offset,
            reg_name=_ascii_str(m.group(4), 'strict'),
            title_desc=_ascii_str(m.group(5), 'strict'))

    @staticmethod
    def PteTable(parsed_title):
        """Create a Table, ready to be parsed, from a ParsedTitle."""
        pt = Table(parsed_title.chapter,
                   parsed_title.table_num,
                   parsed_title.offset,
                   parsed_title.reg_name,
                   parsed_title.title_desc)
        pt.use_columns_pos()
        return pt

    # note: TEs are usually sorted by (y, x), but not strictly
    # in case of a special case. Don't do insane things.
    def _process_te(self, ps, te):
        """
        Dispatch te according to the current state.
        Returns INSERTED, NOT_HANDLED or END_OF_PAGE.
        """
        if te.top >= self.PAGE_BOTTOM:
            return END_OF_PAGE
        # a table title starts a new table, whatever the state is
        parsed_title = self.parse_table_title(ps, te.text)
        if parsed_title:
            ps.close_curt()
            ps.curt = self.PteTable(parsed_title)
            self.tables.append(ps.curt)
            ps.title_y = te.top
            ps.st = ps.TITLE
            return INSERTED
        # label of a header field: its value is the element at its right
        if (ps.st in (ps.TITLE, ps.HEADER)) and (te.text in Table.Fields):
            ps.st = ps.HEADER
            fd = Table.Fields[te.text]
            ote = ps.find_te_around(te.top,
                                    te.left + te.width,
                                    fd.adj_y, fd.adj_x)
            if ote:
                ote.handled = True
                ps.curt.set_field_parse(fd, ote.text)
            else:
                if self.WARN_EMPTY:
                    print(("WARNING: page %d: empty field %r in "
                           "Table %s Offset %s: %s - %s") %
                          (self.page_num, te.text,
                           ps.curt.table_ref, format_offset(ps.curt.offset),
                           ps.curt.reg_name, ps.curt.title_desc))
            return INSERTED
        # continuation line of the title
        if ps.st == ps.TITLE:
            if te.top < ps.title_y + ps.curt.START_LINE_MAX_HEIGHT:
                ps.curt.cat_start_line(te.text)
                ps.title_y = te.top
                return INSERTED
            # else nothing
        if ps.st in (ps.HEADER, ps.COL_HEADER):
            r = ps.curt.columns.try_column_header(te,
                    # info for error traces:
                    {'page': self.page_num,
                     'table': ps.curt.table_ref})
            if r:
                ps.st = ps.COL_HEADER
                return INSERTED
            elif ps.st == ps.COL_HEADER:
                # first element after the column headers
                ps.curt.start_content()
                ps.st = ps.CONTENT
                if DBG:
                    print("RAW %s %s %s %s" %
                          (ps.curt.table_ref, ps.curt.reg_name,
                           ps.curt.columns.ver_pos,
                           dbg_fmt_hor_pos(ps.curt.columns.raw_hor_pos)))
                    print("POS %s %s %s %s" %
                          (ps.curt.table_ref, ps.curt.reg_name,
                           ps.curt.columns.ver_pos,
                           dbg_fmt_hor_pos(ps.curt.columns.hor_pos)))
        if ps.st == ps.CONTENT:
            if te.height > self.TABLE_CHAR_MAX_SIZE:
                # too big to be in a table
                ps.close_curt()
                return INSERTED
            lvl = [None]
            column = ps.curt.columns.search(te.left, te.left + te.width,
                                            self.MIN_MATCH_COL, lvl)
            if DBG:
                print("III %s %s %s %s" %
                      (ps.curt.table_ref, ps.curt.reg_name, lvl[0], te))
            if column is not None:
                # XXX log dismissed one that do not close the table?
                # need to add a special return value to insert_content_te
                # to do that
                if not ps.curt.insert_content_te(column, te):
                    ps.close_curt()
                return INSERTED
            elif not te.special:
                spe_tels = self.try_special(ps, te)
                if spe_tels is not None:
                    if not spe_tels:
                        # replaced by nothing
                        return INSERTED
                    result = NOT_HANDLED
                    for spe_te in spe_tels:
                        spe_r = self.process_te(ps, spe_te)
                        if spe_r == END_OF_PAGE:
                            return END_OF_PAGE
                        elif spe_r == INSERTED:
                            result = INSERTED
                    if result == INSERTED:
                        return INSERTED
        if ps.st in ps.unhandled:
            ps.unhandled[ps.st].append(UD(te, ps.curt.table_ref,
                                          ps.curt.reg_name))
        return NOT_HANDLED

    def process_te(self, ps, te):
        """
        factors out handling of te.handled from _process_te
        """
        if te.handled:
            return INSERTED
        r = self._process_te(ps, te)
        if r != NOT_HANDLED:
            te.handled = True
        return r

    def try_special(self, ps, te):
        """
        Returns None if this te is not to be handled as a special case.
        Else returns a list of te for which insertion should be tried.
        IMPORTANT:
          It is important that each of these te have its 'special' attribute
          set to True so that infinite recursion is reliably avoided.
          (special te won't result in a call to try_special if they are
          not handled)
        """
        return ps.profile.special_replace(ps, te)

    def __init__(self, et_page, profile):
        """
        et_page: <page> XML element
        profile: ProfileBase (derived) instance
        The text elements which stay unhandled are reported on stdout.
        """
        self.page_num = int(et_page.attrib['number'])
        if DBG:
            print("PAGE %s" % self.page_num)

        self.page_height = int(et_page.attrib['height'])
        self.page_width = int(et_page.attrib['width'])

        self.tables = []

        ps = self.ParseState(self, et_page, profile)

        for te in ps.tels:
            if self.process_te(ps, te) == END_OF_PAGE:
                break
        ps.close_curt()

        for state, unh_tels in ps.unhandled.items():
            for ud in unh_tels:
                # it might have been handled later (field value...)
                if not ud.te.handled:
                    print("WWW unhandled %r ### %r" %
                          ((self.page_num, ud.te.identity()),
                           (ud.table_ref, ud.reg_name, state)))


class AbsTables(object):
    """
    Collection of abstract tables (see Table.clone_abstract), indexed by
    table reference and by register base name.
    """
    def __init__(self):
        self.tables = []
        self.by_ref = {}
        self.by_base_name = {}
    def append_table(self, t):
        """
        Add t: as a new abstract table if its reference is not known yet,
        else as the continuation of the table having the same reference
        (header mismatches are reported on stdout).
        Raises ValueError if another table describes the same register.
        """
        if t.table_ref not in self.by_ref:
            if t.reg_base_name in self.by_base_name:
                raise ValueError("already have a table (ref: %s) "
                                 "for register %s" %
                                 (self.by_base_name[t.reg_base_name].table_ref,
                                  t.reg_base_name))
            new_abs_t = t.clone_abstract()
            self.tables.append(new_abs_t)
            self.by_ref[t.table_ref] = new_abs_t
            self.by_base_name[t.reg_base_name] = new_abs_t
        else:
            ml = self.by_ref[t.table_ref].header_mismatch(t)
            if ml:
                print("MISMATCH %s: %s" % (t.table_ref, ml))
            self.by_ref[t.table_ref].extend_abstract(t)
    def find_by_base_name(self, base_name):
        return self.by_base_name.get(base_name)
    def find_by_offset(self, offset):
        """
        Return the table of the register located at offset, which is
        either the offset of the table itself or, for a recurring
        register, the offset of one of its instances. Returns None if
        there is no such table.
        """
        for t in self.tables:
            if t.offset is None:
                # offset unknown (see auto_tables_offset): can't match
                continue
            if offset == t.offset:
                return t
            if t.recurring and (offset > t.offset):
                # distance between two instances of the register
                stride = t.offset_start[1] if t.offset_start else None
                if not stride:
                    continue
                delta = offset - t.offset
                if (delta % stride) == 0 \
                   and (delta // stride) < t.recurring:
                    return t
    # for use in post_fix
    def auto_tables_offset(self):
        for t in self.tables:
            t.auto_offset()


def abs_tables_from_pages(pages, profile):
    """
    Simple wrapper to concatenate and analyse tables from loaded pages.
    """
    a_t = AbsTables()
    for p in pages:
        pte = PageTE(p, profile)
        for t in pte.tables:
            a_t.append_table(t)
    for t in a_t.tables:
        t.analyse_content()
    profile.post_fix(a_t)
    return a_t