#!/usr/bin/python import xml.etree.cElementTree as ET from operator import attrgetter, itemgetter import re from collections import namedtuple from copy import copy DBG = 0 def filter_pages(doc, min_page, max_page): return [page for page in doc.getroot().getchildren() if min_page <= int(page.attrib['number']) <= max_page] def load_datasheet_pages(filepath, min_page, max_page): doc = ET.parse(filepath) return filter_pages(doc, min_page, max_page) ASCIIFY_TABLE = { u'\u201c': '"', u'\u201d': '"', u'\u2019': "'", u'\u2018': "'", u'\u2013': "-", } def asciify(us): result = [ASCIIFY_TABLE.get(a, a).encode('ascii', 'replace') for a in us] return ''.join(result) class State(object): __slots__ = ['name'] def __init__(self, name): self.name = name def __repr__(self): return '<%s>' % self.name def _get_all_text_rec(elem, parts): if elem.text: parts.append(elem.text) for c in elem.getchildren(): _get_all_text_rec(c, parts) if c.tail: parts.append(c.tail) return parts def get_all_text(elem): return u''.join(_get_all_text_rec(elem, [])) TE_Id = namedtuple('TE_Id', 'text top left width height') class SomeText(object): __slots__ = ['text', 'top', 'left', 'width', 'height', 'handled', 'special'] def __init__(self, text, top, left, width, height, special): self.text = text self.top = top self.left = left self.width = width self.height = height self.special = special self.handled = False @classmethod def from_et(cls, elem): return cls(get_all_text(elem), int(elem.attrib['top']), int(elem.attrib['left']), int(elem.attrib['width']), int(elem.attrib['height']), special=False) @classmethod def special_from_id(cls, i): return cls(i.text, i.top, i.left, i.width, i.height, special=True) def identity(self): return TE_Id(self.text, self.top, self.left, self.width, self.height) def __repr__(self): return '' % \ (self.top, self.left, self.width, self.height, self.text, " special" if self.special else "") def dbg_fmt_hor_pos(hp): return sorted(hp.items(), key=itemgetter(1)) # In left to right order: # WARNING: this should not ever need to change, but if it does after all # you should carefully review this whole file... COL_RANGE = 0 COL_ACRONYM = 1 COL_DESCRIPTION = 2 COL_STICKY = 3 COL_RESET = 4 COL_ACCESS = 5 COL_NUMBER = 6 CD = namedtuple('CD', 'width idx') class ColumnsPos(object): Columns = { 'bit range': CD(60, COL_RANGE), 'bit acronym': CD(58, COL_ACRONYM), 'bit description': CD(204, COL_DESCRIPTION), 'sticky': CD(28, COL_STICKY), 'bit reset': CD(58, COL_RESET), 'bit access': CD(51, COL_ACCESS), } ADJUST = 10 def __init__(self): self.hor_pos = dict([(k, None) for k in self.Columns.iterkeys()]) self.ver_pos = None # DBG: self.raw_hor_pos = dict(self.hor_pos) def overlap(self, txt, left, right): for k, a in self.hor_pos.iteritems(): if a and (left < a[1]) and not (right <= a[0]): return k def find_k_left(self, left): for k, a in self.hor_pos.iteritems(): if a and (left - self.ADJUST <= a[1] <= left + self.ADJUST): return k def find_k_right(self, right): for k, a in self.hor_pos.iteritems(): if a and (right - self.ADJUST <= a[0] <= right + self.ADJUST): return k def adjust(self, pos_a, cd_a, pos_b, cd_b): w = cd_a.width + cd_b.width # the longest can be adjusted more: return ((pos_a * cd_b.width) # move less if the other is longer + (pos_b * cd_a.width) + w/2) / w def try_column_header(self, te, dbg=""): txt = te.text.strip().lower() if txt == 'bit range': self.ver_pos = te.top cd = self.Columns.get(txt) if (not cd) and txt == 'value': cd = self.Columns['bit reset'] if not cd: return False left = te.left + te.width / 2 - cd.width / 2 right = left + cd.width if txt == 'value': o_l, o_r = self.hor_pos['bit reset'] if not (o_l - self.ADJUST <= left <= o_l + self.ADJUST) \ or not (o_r - self.ADJUST <= right <= o_r + self.ADJUST): print "WARNING: %r:" % dbg, \ "'value' (%d, %d) not haligned with 'bit reset'" \ % (left, right), \ "### %r" % vars(self) return True # DBG: self.raw_hor_pos[txt] = [left, right] k_left = self.find_k_left(left) if k_left: near_left = self.hor_pos[k_left][1] cd_left = self.Columns[k_left] if cd_left.idx + 1 != cd.idx: print "WARNING: %r:" % dbg, \ "column left of '%s' should not be '%s'" % (txt, k_left) self.hor_pos[k_left][1] = left = self.adjust(left, cd, near_left, cd_left) k_right = self.find_k_right(right) if k_right: near_right = self.hor_pos[k_right][0] cd_right = self.Columns[k_right] if cd_right.idx - 1 != cd.idx: print "WARNING: %r:" % dbg, \ "column right of '%s' should not be '%s'" % (txt, k_right) self.hor_pos[k_right][0] = right = self.adjust(right, cd, near_right, cd_right) other_k = self.overlap(txt, left, right) if other_k: raise ValueError("column %r (%d, %d) overlaps with %r (%d, %d)" % (txt, left, right, other_k, self.hor_pos[other_k][0], self.hor_pos[other_k][1])) self.hor_pos[txt] = [left, right] return True @staticmethod def matching_level(l, r, l_ref, r_ref): l_com = max(l, l_ref) r_com = min(r, r_ref) if l_com >= r_com: return 0.0 else: return (r_com - l_com) / float(r - l) def search(self, left, right, min_match_level, out_level=None): for k, (l_col, r_col) in self.hor_pos.iteritems(): level = self.matching_level(left, right, l_col, r_col) if level >= min_match_level: if out_level is not None: out_level[0] = level return self.Columns[k].idx class Cell(object): ADJ_X = 5 def __init__(self, column, col_name): self.paragraphs = [] self.last_te = None self.column = column self.col_name = col_name def is_empty(self): return self.paragraphs == [] def insert_te(self, te): txt = asciify(te.text) if self.last_te \ and (te.top <= self.last_te.top + self.last_te.height): self.paragraphs[-1] += txt else: self.paragraphs.append(txt) self.last_te = te def analyse(self): self.analysis_warning = False # post-analyse def simple_lines(self): res = [self.col_name] res.extend([("\t" + p) for p in self.paragraphs]) return res def simple_datastructure(self): return (self.col_name, self.paragraphs) class CellSymbol(Cell): WARN_EMPTY = True def analyse(self): if self.is_empty(): self.content = '' self.analysis_warning = self.WARN_EMPTY else: self.content = self.paragraphs[0].strip() self.analysis_warning = (len(self.paragraphs) > 1) def warn(self): return "\t\t" if self.analysis_warning else "" def simple_lines(self): return ["%-13s %s%s" % (self.col_name, self.content, self.warn())] def simple_datastructure(self): return (self.col_name, self.content) class CellRange(CellSymbol): def _parse(self): m = re.match(r"(\d+)[- :]+(\d+)", self.content) if m: self.range = (int(m.group(1)), int(m.group(2))) return try: single_bit = int(self.content) self.range = (single_bit, single_bit) return except ValueError: pass def analyse(self): CellSymbol.analyse(self) self.range = None self._parse() if self.range is None: self.analysis_warning = True def bits_width(self): if self.range is None: return None return self.range[0] + 1 - self.range[1] def str_range(self): if self.range is None: return 'None' elif self.range[0] == self.range[1]: return "%d" % self.range[0] else: return "%d-%d" % self.range def simple_lines(self): return ["%-13s %s%s" % (self.col_name, self.str_range(), self.warn())] def simple_datastructure(self): return (self.col_name, self.range) class CellConstraint(CellSymbol): # abstract class, derivate and add e.g. # ALLOWED = ('RO', 'RW') def analyse(self): CellSymbol.analyse(self) if self.content not in self.ALLOWED: self.analysis_warning = True class CellSticky(CellConstraint): WARN_EMPTY = False # for upper layers ALLOWED = ('Y', 'N', '') class CellAccess(CellConstraint): ALLOWED = ('RW', 'RO', 'RWS', # <= enough for smrbase 'RWC', 'RWO', 'RW0C', # + imch_conf 'RWL', # + imch_conf 'RV', 'RCWC', 'WO', 'RC', # + gbe 'RO/RWC') # + gbe def to_bin(i): if i == 0: return '0' r = "" while i: r = str(i & 1) + r i /= 2 return r class CellReset(CellSymbol): def analyse(self): CellSymbol.analyse(self) self.value = None self.base = None self.bits = None if self.content.endswith('b'): self.base = 2 elif self.content.endswith('h'): self.base = 16 if self.base: try: self.value = int(self.content[:-1], self.base) except ValueError: pass if self.content in ('0', '1'): self.value = int(self.content) self.base = 2 if self.value is None: self.analysis_warning = True # post-analyse def set_bits(self, bits): self.bits = bits def bin_repr(self): if self.value is None: return 'None' bin = to_bin(self.value) if self.bits is None: return bin + 'b' else: return ('0' * (self.bits - len(bin))) + bin + 'b' def hex_repr(self): if self.value is None: return 'None' if self.bits is None: return "%xh" % self.value else: nibs = (self.bits + 3) / 4 return ("%%0%dxh" % nibs) % self.value def simple_repr(self): if self.value is None: return 'None' bh = (self.bin_repr(), self.hex_repr()) if self.base == 2: return "[ %s ]\t %s " % bh else: return " %s \t[ %s ]" % bh def simple_lines(self): return ["%-11s %s%s" % (self.col_name, self.simple_repr(), self.warn())] def simple_datastructure(self): return (self.col_name, self.value) # Line PageTE INSERTED = State('INSERTED') # X X NEW_LINE = State('NEW_LINE') # X INVALID_LINE = State('INVALID_LINE') # X NOT_HANDLED = State('NOT_HANDLED') # X END_OF_PAGE = State('END_OF_PAGE') # X VALID_TE_RANGE = re.compile(r'[-0-9 :]').match class Line(object): def __init__(self, top): # direct symbolic cell access # prefixed by c_ cause i might also use unprefixed attrs in the # future and directly put payloads there. self.c_range = CellRange (COL_RANGE, 'range') self.c_acronym = CellSymbol (COL_ACRONYM, 'acronym') self.c_description = Cell (COL_DESCRIPTION, 'description') self.c_sticky = CellSticky (COL_STICKY, 'sticky') self.c_reset = CellReset (COL_RESET, 'reset') self.c_access = CellAccess (COL_ACCESS, 'access') # access by columns index: self.cells = [ self.c_range, self.c_acronym, self.c_description, self.c_sticky, self.c_reset, self.c_access, ] self.top = top self.bottom = None self.bits = None def is_empty(self): for c in self.cells: if not c.is_empty(): return False return True def try_insert_te(self, column, te): if (self.bottom is None) or (te.top < self.bottom): # NOTE: this only works if the range cell contains only one line if (self.bottom is None) and (column == COL_RANGE): if not VALID_TE_RANGE(te.text): return INVALID_LINE self.bottom = 2 * te.top - self.top if (self.bottom is not None) \ and (te.top + te.height - 1 > self.bottom): self.bottom = te.top + te.height - 1 self.cells[column].insert_te(te) return INSERTED else: return NEW_LINE def simple_print(self): for c in self.cells: for l in c.simple_lines(): print l def value_print(self, value): cell_value = copy(self.c_reset) bit_base = self.c_range.range[1] bit_mask = (1 << cell_value.bits) - 1 cell_value.value = (value >> bit_base) & bit_mask cell_value.col_name = "value" for c in self.cells[:2] + [cell_value] + self.cells[2:]: for l in c.simple_lines(): print l def analyse_line(self): for c in self.cells: c.analyse() self.bits = self.c_range.bits_width() self.c_reset.set_bits(self.bits) def strip_strict_ascii(s): return s.strip().encode('ascii', 'strict') def parse_offset(s): m = re.search(r"([0-9a-f]+)h(?:\s+at\s+([0-9a-f]+)h)? *$", s, re.I) if m: base, inc = m.groups() base = int(base, 16) if inc: inc = int(inc, 16) return base, inc else: raise ValueError("can't parse offset %r" % s) def format_offset(off): if off is None: return "-" if isinstance(off, basestring): return off else: return "%02Xh" % off def parse_size(s): m = re.match(r"(\d+)\s+bit", s, re.I) if m: return int(m.group(1)) else: raise ValueError("can't parse size %r" % s) def parse_default(s): s = s.strip() if not s.endswith('h'): # raise ValueError("can't parse default value %r" % s) return s try: return int(s[:-1], 16) except ValueError: if "X" in s: # gruik hack return s raise def fmt_identity(t, x): return x def fmt_hex(t, x): return "%xh" % x def fmt_offset_start_end(t, x): if x[1] is None: return "%xh" % x[0] else: return "%xh at %xh" % x def fmt_default(t, x): if isinstance(x, basestring): return x nibs = (t.size + 3) / 4 return ("%%0%dxh" % nibs) % x FD = namedtuple('FD', 'attr_name adj_y adj_x parser') class Table(object): # WARNING: # offset_end often contains random information in recurring registers # attr_name adj_y adj_x parser Fields = { 'Description:': FD('description', 2, 5, strip_strict_ascii), 'View:': FD('view', 2, 5, strip_strict_ascii), 'BAR:': FD('bar', 2, 5, strip_strict_ascii), 'Offset Start:': FD('offset_start', 2, 5, parse_offset), 'Offset End:': FD('offset_end', 2, 5, parse_offset), 'Power Well:': FD('power_well', 2, 5, strip_strict_ascii), 'Size:': FD('size', 2, 5, parse_size), 'Default:': FD('default', 2, 5, parse_default), 'Bus:Device:Function:': FD('bus_device_function', 2, 5, strip_strict_ascii) } HeaderAttr = [ # from table title: ('table_ref', fmt_identity), ('offset', fmt_hex), ('reg_name', fmt_identity), ('recurring', fmt_identity), ('reg_base_name', fmt_identity), ('title_desc', fmt_identity), # from table header: ('description', fmt_identity), ('view', fmt_identity), ('bar', fmt_identity), ('offset_start', fmt_offset_start_end), ('offset_end', fmt_offset_start_end), ('power_well', fmt_identity), ('size', fmt_identity), ('default', fmt_default), ('bus_device_function', fmt_identity), ] HeaderAttrDict = dict(HeaderAttr) START_LINE_MAX_HEIGHT = 11 # max interval _between_ title lines COL_HEADER_HEIGHT = 26 def __init__(self, chapter, table_num, offset, reg_name, title_desc): self.chapter = chapter self.table_num = table_num self.offset = offset self.reg_name = reg_name self.title_desc = title_desc self.table_ref = "%d-%d" % (self.chapter, self.table_num) # self.recurring, self.reg_base_name = self.parse_reg_name(reg_name) # self.description = None self.view = None self.bar = None self.offset_start = None self.offset_end = None self.power_well = None self.size = None self.default = None self.bus_device_function = None # self.columns = None self.cur_line = None self.lines = [] @staticmethod def parse_reg_name(reg_name): m = re.match(r"(.*?)[[](\d+)[-:](\d+)[]]", reg_name) if not m: return None, reg_name reg_base_name, left, right = m.groups() left = int(left) right = int(right) low = min(left, right) high = max(left, right) if low != 0: print ("WARNING: don't know how to handle recurring register %s " "not starting at index 0. Loading as non recurring for now." % reg_name) return None, reg_base_name return high + 1, reg_base_name def attrs_set_abstract(self): del self.columns del self.cur_line def clone_abstract(self): """ Clone the abstract part of this table, that is everything needed for extraction of payload informations. result.columns and result.cur_line are deleted. self.lines is shallow copied. => result.lines = self.lines[:] """ result = copy(self) result.attrs_set_abstract() result.lines = self.lines[:] return result def extend_abstract(self, other): """ Once a first 'concrete' table has been cloned to an abstract version, it can be extended with other parts of the same table (from subsequent pages). Example if a table is splitted between (end of) page p_1 and (start of page) p_2: >>> pte_1 = PageTE(p_1, profile) >>> pte_2 = PageTE(p_2, profile) >>> full_table = pte_1.tables[-1].clone_abstract() >>> full_table.extend_abstract(pte_2.tables[0]) """ self.lines.extend(other.lines) def header_mismatch(self, other): return [ (attr, getattr(self, attr), getattr(other, attr)) for (attr, _) in self.HeaderAttr if getattr(self, attr) != getattr(other, attr) ] def use_columns_pos(self): self.columns = ColumnsPos() def cat_start_line(self, text): self.title_desc += text.encode('ascii', 'strict') def set_field_parse(self, fd, s): try: setattr(self, fd.attr_name, fd.parser(s)) except: print "exception while parsing %r" % vars(self) raise def close(self): self.title_desc = re.sub(r'\s*$Sheet .* of .*$\s*', ' ', self.title_desc).strip() self.cur_line = None def start_content(self): self.cur_line = Line(self.columns.ver_pos + self.COL_HEADER_HEIGHT / 2) self.lines.append(self.cur_line) def insert_content_te(self, column, te): res = self.cur_line.try_insert_te(column, te) if res == INSERTED: return True elif res == NEW_LINE: if self.cur_line.bottom is None: next_top = te.top + 1 - te.height else: next_top = self.cur_line.bottom self.cur_line = Line(next_top) self.lines.append(self.cur_line) self.cur_line.try_insert_te(column, te) return True else: # INVALID_LINE self.lines = self.lines[:-1] if self.lines: self.cur_line = self.lines[-1] else: self.cur_line = None return False def analyse_content(self): for l in self.lines: l.analyse_line() def check_ranges(self): rembits = self.size for l in self.lines: if (l.c_range.range is None) or (rembits != l.c_range.range[0] + 1): return False rembits = l.c_range.range[1] return rembits == 0 def auto_offset(self): if self.offset is None: self.offset = self.offset_start[0] def fmt(self, field_name): fmt_func = self.HeaderAttrDict[field_name] return fmt_func(self, getattr(self, field_name)) def value_print(self, value): print '============================================================' print '%xh' % self.offset, self.reg_name for h, fmt in self.HeaderAttr: print " %-15s %s" % (h, fmt(self, getattr(self, h))) print for l in self.lines: print " ----------------" l.value_print(value) print print class ProfileBase(object): """ To properly parse register tables from the informally written 320066 datasheet, it is typically needed to perform some adjustments during the dispatching of text elements, and/or after the parse stage itself on abstracted tables. Also some tables need to be blacklisted, and the page range to be defined. Derive this class to implement such a register set profile, then register it by calling register_profile(derived_class) """ name = None # string - symbolic name of the register set min_page = None # int - first page to parse (included) max_page = None # int - last page to parse (included) @staticmethod def special_te_mapper(ps, te, table): """ This method can be used to help implement the .special_replace() method. ps: Parse State - instance of PageTE.ParseState te: Text Element - instance of SomeText table: replacement (hash) table, ex: { (432, TE_Id(text=u'N 00001b', top=384, left=409, width=60, height=8)): [TE_Id(text=u'N', top=384, left=409, width=4, height=8), TE_Id(text=u'00001b', top=384, left=427, width=50, height=8)], } The key is (page_num, te_id) where te_id describes the original text element which could not be fitted in a table cell. The value is a list of TE_Id, to replace the offending one. """ key = (ps.pte.page_num, te.identity()) repl_ids = table.get(key) if repl_ids is not None: return [SomeText.special_from_id(p) for p in repl_ids] def special_replace(self, ps, te): return None def post_fix(self, a_t): pass def table_blacklisted(self, chapter, table_num): """ chapter: int table_num: int """ return False _profiles = {} def register_profile(profile_class): if not isinstance(profile_class.name, basestring): raise ValueError("profile_class should be a string") _profiles[profile_class.name] = profile_class def profile_factory(profile_name): if profile_name not in _profiles: raise ValueError("profile %s does not exist" % profile_name) profile_class = _profiles[profile_name] return profile_class() # Unhandled Description UD = namedtuple('UD', 'te table_ref reg_name') ParsedTitle = namedtuple('ParsedTitle', 'chapter table_num offset reg_name title_desc') class PageTE(object): class ParseState(object): """ This contains attributes that are shared between PageTE methods during the parse stage, but really conceptually are local vars. I put them all here because it's easier to get ride of them all like that, and also less error prone. Plus they are clearly identified, so user code won't try to do stupid things and hopefully the mess in PageTE will be kept limited. """ NOP = State('NOP') TITLE = State('TITLE') HEADER = State('HEADER') COL_HEADER = State('COL_HEADER') CONTENT = State('CONTENT') def __init__(self, pte, et_page, profile): # tels: text elements # curt: current table # st: state (one of the above) # self.unhandled = {# no NOP self.TITLE: [], self.HEADER: [], self.COL_HEADER: [], self.CONTENT: []} self.pte = pte self.tels = [SomeText.from_et(elem) for elem in et_page.findall('text')] self.tels.sort(key=lambda elem: (elem.top, elem.left)) self.tels_X = [[] for i in xrange(self.pte.page_width)] self.tels_Y = [[] for i in xrange(self.pte.page_height)] for te in self.tels: self.tels_X[te.left].append(te) self.tels_Y[te.top].append(te) self.curt = None self.st = self.NOP self.title_y = None self.profile = profile def close_curt(self): if self.curt: self.curt.close() self.curt = None self.st = self.NOP def find_te_around(self, y, x, ay, ax): y_min = max(0, y - ay) y_max = min(self.pte.page_height, y + ay + 1) x_min = max(0, x - ax) x_max = min(self.pte.page_width, x + ax + 1) for tel_line in self.tels_Y[y_min:y_max]: for te in tel_line: if x_min <= te.left < x_max: return te WARN_EMPTY = False MIN_MATCH_COL = 0.9 # any text under this point marks and end of page PAGE_BOTTOM = 732 # gruik hack TABLE_CHAR_MAX_SIZE = 10 # quite gruik too start_search = re.compile(ur""" Table\s+(\d+)-(\d+)\s*\.\s* # - (?:Offset\s+([0-9a-f]+)h\s*:\s*)? # (\S+?(?:[[]\S+?[]])?)(?:\s?[-\u2013:])+ # (.*)$ # """, re.I | re.VERBOSE).search def parse_table_title(self, ps, text): m = self.start_search(text) if not m: return None chapter = int(m.group(1)) table_num = int(m.group(2)) offset_group = m.group(3) offset = (offset_group and int(offset_group, 16)) if ps.profile.table_blacklisted(chapter, table_num): return None return ParsedTitle( chapter=chapter, table_num=table_num, offset=offset, reg_name=m.group(4).encode('ascii', 'strict'), title_desc=m.group(5).encode('ascii', 'strict')) @staticmethod def PteTable(parsed_title): pt = Table(parsed_title.chapter, parsed_title.table_num, parsed_title.offset, parsed_title.reg_name, parsed_title.title_desc) pt.use_columns_pos() return pt # note: TEs are usually sorted by (y, x), but not strictly # in case of a special case. Don't do insane things. def _process_te(self, ps, te): if te.top >= self.PAGE_BOTTOM: return END_OF_PAGE parsed_title = self.parse_table_title(ps, te.text) if parsed_title: ps.close_curt() ps.curt = self.PteTable(parsed_title) self.tables.append(ps.curt) ps.title_y = te.top ps.st = ps.TITLE return INSERTED if (ps.st in (ps.TITLE, ps.HEADER)) and (te.text in Table.Fields): ps.st = ps.HEADER fd = Table.Fields[te.text] ote = ps.find_te_around(te.top, te.left + te.width, fd.adj_y, fd.adj_x) if ote: ote.handled = True ps.curt.set_field_parse(fd, ote.text) else: if self.WARN_EMPTY: print ("WARNING: page %d: empty field %r in " "Table %s Offset %s: %s - %s") % \ (self.page_num, te.text, ps.curt.table_ref, format_offset(ps.curt.offset), ps.curt.reg_name, ps.curt.title_desc) return INSERTED if ps.st == ps.TITLE: if te.top < ps.title_y + ps.curt.START_LINE_MAX_HEIGHT: ps.curt.cat_start_line(te.text) ps.title_y = te.top return INSERTED # else nothing if ps.st in (ps.HEADER, ps.COL_HEADER): r = ps.curt.columns.try_column_header(te, # info for error traces: {'page': self.page_num, 'table': ps.curt.table_ref}) if r: ps.st = ps.COL_HEADER return INSERTED elif ps.st == ps.COL_HEADER: ps.curt.start_content() ps.st = ps.CONTENT if DBG: print "RAW", ps.curt.table_ref, ps.curt.reg_name, ps.curt.columns.ver_pos, dbg_fmt_hor_pos(ps.curt.columns.raw_hor_pos) print "POS", ps.curt.table_ref, ps.curt.reg_name, ps.curt.columns.ver_pos, dbg_fmt_hor_pos(ps.curt.columns.hor_pos) if ps.st == ps.CONTENT: if te.height > self.TABLE_CHAR_MAX_SIZE: ps.close_curt() return INSERTED lvl = [None] column = ps.curt.columns.search(te.left, te.left + te.width, self.MIN_MATCH_COL, lvl) if DBG: print "III", ps.curt.table_ref, ps.curt.reg_name, lvl[0], te if column is not None: # XXX log dismissed one that do not close the table? # need to add a special return value to insert_content_te # to do that if not ps.curt.insert_content_te(column, te): ps.close_curt() return INSERTED elif not te.special: spe_tels = self.try_special(ps, te) if spe_tels is not None: if not spe_tels: return INSERTED result = NOT_HANDLED for spe_te in spe_tels: spe_r = self.process_te(ps, spe_te) if spe_r == END_OF_PAGE: return END_OF_PAGE elif spe_r == INSERTED: result = INSERTED if result == INSERTED: return INSERTED if ps.st in ps.unhandled: ps.unhandled[ps.st].append(UD(te, ps.curt.table_ref, ps.curt.reg_name)) return NOT_HANDLED def process_te(self, ps, te): """ factors out handling of te.handled from _process_te """ if te.handled: return INSERTED r = self._process_te(ps, te) if r != NOT_HANDLED: te.handled = True return r def try_special(self, ps, te): """ Returns None if this te is not to be handled as a special case. Else returns a list of te for which insertion should be tried. IMPORTANT: It is important that each of these te have its 'special' attribute set to True so that infinite recursion is reliably avoided. (special te won't result in a call to try_special if they are not handled) """ return ps.profile.special_replace(ps, te) def __init__(self, et_page, profile): self.page_num = int(et_page.attrib['number']) if DBG: print "PAGE", self.page_num self.page_height = int(et_page.attrib['height']) self.page_width = int(et_page.attrib['width']) self.tables = [] ps = self.ParseState(self, et_page, profile) for te in ps.tels: if self.process_te(ps, te) == END_OF_PAGE: break ps.close_curt() for state, unh_tels in ps.unhandled.iteritems(): for ud in unh_tels: if not ud.te.handled: print "WWW unhandled %r ### %r" % \ ((self.page_num, ud.te.identity()), (ud.table_ref, ud.reg_name, state)) class AbsTables(object): def __init__(self): self.tables = [] self.by_ref = {} self.by_base_name = {} def append_table(self, t): if t.table_ref not in self.by_ref: if t.reg_base_name in self.by_base_name: raise ValueError("already have a table (ref: %s) " "for register %s" % (self.by_base_name[t.reg_base_name].table_ref, t.reg_base_name)) new_abs_t = t.clone_abstract() self.tables.append(new_abs_t) self.by_ref[t.table_ref] = new_abs_t self.by_base_name[t.reg_base_name] = new_abs_t else: ml = self.by_ref[t.table_ref].header_mismatch(t) if ml: print "MISMATCH %s:" % t.table_ref, ml self.by_ref[t.table_ref].extend_abstract(t) def find_by_base_name(self, base_name): return self.by_base_name.get(base_name) def find_by_offset(self, offset): for t in self.tables: if offset == t.offset: return t if t.recurring and (offset > t.offset): delta = offset - t.offset if (delta % t.offset_start[1]) == 0 \ and (delta / t.offset_start[1]) < t.recurring: return t # for use in post_fix def auto_tables_offset(self): for t in self.tables: t.auto_offset() def abs_tables_from_pages(pages, profile): """ Simple wrapper to concatenate and analyse tables from loaded pages. """ a_t = AbsTables() for p in pages: pte = PageTE(p, profile) for t in pte.tables: a_t.append_table(t) for t in a_t.tables: t.analyse_content() profile.post_fix(a_t) return a_t