import html
import re
import warnings
from dataclasses import dataclass
from typing import Literal, Optional, Union, cast

import lxml.etree
from lxml import etree as ET

from formex_viewer.main import Language

# Language-specific building blocks of the cross-reference patterns.
# "exclusion" is a negative lookahead that rejects references which point
# into *another* act ("Article 5 of Regulation ...") while still accepting
# "Article 5 of this Regulation".
_XREF_PATTERN_PARTS = {
    Language.ENG: {
        "article": r"(Art\.|Articles?)",
        "annex": r"(Ann\.|Annex)",
        "exclusion": r"(?! of(?! this))",
    },
    Language.DEU: {
        "article": r"(Art\.|Artikels?)",
        "annex": r"(Anhang)",
        "exclusion": r"(?! von)",
    },
}


def _build_xref_patterns(parts: dict[str, str]) -> dict[str, re.Pattern[str]]:
    """Compile the article and annex patterns for one language.

    The patterns match only at word boundaries.  ``(?!\\d)``, ``(?!\\(\\d+\\))``
    and the trailing ``\\b`` stop the regex engine from backtracking to a
    shorter number just to get past the exclusion lookahead (otherwise
    "Article 15 of Directive ..." would be reported as "Article 1").
    """
    article = (
        rf"\b{parts['article']}\s+(?P<art_num>\d+)(?!\d)"
        rf"(?:\((?P<parag_num>\d+)\))?(?!\(\d+\))"
        rf"(?:{parts['exclusion']})"
    )
    annex = (
        rf"\b{parts['annex']}\s+(?P<annex_num>[DILMVX]+)\b"
        rf"(?:{parts['exclusion']})"
    )
    return {
        "article": re.compile(article, flags=re.IGNORECASE),
        "annex": re.compile(annex, flags=re.IGNORECASE),
    }


# Compiled once at import time; keyed by language, then by reference target.
_XREF_PATTERNS = {
    language: _build_xref_patterns(parts)
    for language, parts in _XREF_PATTERN_PARTS.items()
}


def text_content(el: ET._Element) -> str:
    """Return the text of an element and all its descendants, in document order.

    The element's own tail (the text that follows its closing tag) is appended
    once, because callers of ``extract_xrefs`` historically relied on the tail
    being searched as well.  Comments and processing instructions do not
    contribute, since XPath ``string()`` only concatenates text nodes.
    """
    return str(el.xpath("string()")) + (el.tail or "")


@dataclass
class CrossReference:
    # Kind of provision the reference points to.
    target: Literal["article", "annex"]
    # The exact substring that was matched in the source text.
    text: str
    # Article number (digits) or annex number (roman numeral), as written.
    id: str
    # Paragraph number for references such as "Article 5(2)"; None otherwise.
    paragraph: int | None = None


def _find_xrefs(
    text: str, language: Language
) -> list[tuple[int, int, CrossReference]]:
    """Locate cross-references in plain text.

    Returns ``(start, end, reference)`` triples: all article matches first,
    then all annex matches, each group in order of appearance.  Warns and
    returns an empty list when the language has no patterns.
    """
    patterns = _XREF_PATTERNS.get(language)
    if patterns is None:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
        )
        return []

    hits: list[tuple[int, int, CrossReference]] = []
    for target, pattern in patterns.items():
        id_group = "art_num" if target == "article" else "annex_num"
        for match in pattern.finditer(text):
            # Only the article pattern defines "parag_num", and even there it
            # is optional, so a missing paragraph is a normal case.
            parag_num = match.groupdict().get("parag_num")
            reference = CrossReference(
                target=cast(Literal["article", "annex"], target),
                id=match.group(id_group),
                paragraph=int(parag_num) if parag_num else None,
                text=match.group(0),
            )
            hits.append((match.start(), match.end(), reference))
    return hits


def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
    """Extract cross-references from an XML element.

    Args:
        el: The XML element to extract cross-references from.
        language: Language of the text; selects the matching patterns.

    Returns:
        A list of cross-references found in the element's text: article
        references first, then annex references.  Empty (with a warning)
        when the language is not supported.
    """
    return [xref for _, _, xref in _find_xrefs(text_content(el), language)]


def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
    """Extract a specific article from a Formex document.

    Args:
        doc: The XML document to extract from.
        article_id: The article number.

    Returns:
        The first matching ARTICLE element, or None if there is none.
    """
    # IDENTIFIER values are matched as zero-padded three-digit numbers.
    found = doc.xpath(f".//ARTICLE[@IDENTIFIER='{article_id:03d}']")
    return found[0] if found else None


def extract_paragraph(
    doc: ET._Element, article_id: int, paragraph_id: int
) -> ET._Element | None:
    """Extract a specific paragraph from an article in a Formex document.

    Args:
        doc: The XML document to extract from.
        article_id: The article number.
        paragraph_id: The paragraph number.

    Returns:
        The first matching PARAG element, or None if there is none.
    """
    # PARAG identifiers have the form "<article>.<paragraph>", both zero-padded.
    found = doc.xpath(
        f".//PARAG[@IDENTIFIER='{article_id:03d}.{paragraph_id:03d}']"
    )
    return found[0] if found else None


class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5.

    NOTE(review): the HTML element names, class names and data attributes
    emitted by this class were reconstructed during review; confirm them
    against the stylesheet / front end that consumes the output.
    """

    # Elements whose text is a heading; cross-references are not linked there.
    _TITLE_TAGS = frozenset({"TI", "STI", "TI.ART", "STI.ART"})

    # HT TYPE values that map directly onto an HTML element.
    _HT_ELEMENTS = {
        "BOLD": "strong",
        "ITALIC": "em",
        "SUB": "sub",
        "SUP": "sup",
        "UNDERLINE": "u",
    }

    # Formex LIST TYPE values mapped to CSS list-style-type values.
    _LIST_STYLES = {
        "ARAB": "decimal",
        "ALPHA": "upper-alpha",
        "alpha": "lower-alpha",
        "ROMAN": "upper-roman",
        "roman": "lower-roman",
        "BULLET": "disc",
        "DASH": "'—'",
        "NDASH": "'–'",
        "NONE": "none",
        "OTHER": "none",
    }
    _ORDERED_LIST_TYPES = frozenset({"ARAB", "ALPHA", "alpha", "ROMAN", "roman"})

    def __init__(self, language: Language, namespace: Optional[str] = None):
        """
        Initialize the converter.

        Args:
            language: Language object to determine the language for
                cross-reference extraction
            namespace: Optional XML namespace to use when parsing elements
        """
        self.ns = namespace
        self.language = language
        self.ns_prefix = f"{{{namespace}}}" if namespace else ""

    # ------------------------------------------------------------------
    # Small tree helpers
    # ------------------------------------------------------------------

    def _get_tag(self, tag: str) -> str:
        """Get the tag name with namespace (Clark notation) if available."""
        return f"{self.ns_prefix}{tag}"

    def _local_name(self, element: ET._Element) -> str:
        """Return the element's tag with the configured namespace removed."""
        return element.tag.removeprefix(self.ns_prefix)

    def _first_child(self, element: ET._Element, tag: str) -> ET._Element | None:
        """Return the first direct child with the given tag, or None.

        ``iterchildren`` is used instead of XPath because XPath expressions
        cannot contain Clark-notation (``{namespace}TAG``) names.
        """
        return next(element.iterchildren(tag=self._get_tag(tag)), None)

    def _first_descendant(
        self, element: ET._Element, tag: str
    ) -> ET._Element | None:
        """Return the first descendant with the given tag, or None."""
        return next(element.iterdescendants(tag=self._get_tag(tag)), None)

    def _get_text(self, element: ET._Element) -> str:
        """Get the text content of an element, including all nested text.

        The element's own tail is not included.
        """
        if element is None:
            return ""
        return str(element.xpath("string()"))

    def _create_id(self, identifier: str) -> str:
        """Create a valid HTML ID from the article identifier."""
        # Anything that is not alphanumeric or a dash becomes a dash.
        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
        return f"art-{clean_id}"

    # ------------------------------------------------------------------
    # Cross-references
    # ------------------------------------------------------------------

    def _xref_href(self, xref: CrossReference) -> str:
        """Build the fragment link for a cross-reference.

        Article targets reuse ``_create_id`` on the zero-padded identifier
        scheme that ``extract_article`` / ``extract_paragraph`` query for, so
        they line up with the ids emitted by ``convert_article`` and
        ``_convert_parag``.
        """
        if xref.target == "article":
            identifier = f"{int(xref.id):03d}"
            if xref.paragraph is not None:
                identifier += f".{xref.paragraph:03d}"
            return f"#{self._create_id(identifier)}"
        # NOTE(review): this module renders no annex anchors; confirm the id
        # scheme used by whatever renders annexes.
        return f"#annex-{xref.id.upper()}"

    def _xref_markup(self, xref: CrossReference) -> str:
        """Return the HTML link that represents one cross-reference."""
        attributes = [
            ("class", "cross-ref"),
            ("href", self._xref_href(xref)),
            ("data-target", xref.target),
            ("data-id", xref.id),
        ]
        if xref.paragraph is not None:
            attributes.append(("data-paragraph", str(xref.paragraph)))
        rendered = " ".join(
            f'{name}="{html.escape(value)}"' for name, value in attributes
        )
        return f"<a {rendered}>{html.escape(xref.text)}</a>"

    def _replace_xref(self, text: str, xref: CrossReference) -> str:
        """Replace a cross-reference instance with semantic markup in the text.

        Every occurrence of ``xref.text`` is replaced.  The converter itself
        uses ``_link_xrefs``, which works on match positions and therefore
        cannot wrap the same text twice; this method is kept for callers that
        already hold a ``CrossReference``.
        """
        return text.replace(xref.text, self._xref_markup(xref))

    def _link_xrefs(self, text: str) -> str:
        """HTML-escape ``text`` and wrap every cross-reference in a link."""
        if not text:
            return ""
        pieces = []
        cursor = 0
        hits = sorted(
            _find_xrefs(text, self.language), key=lambda hit: (hit[0], hit[1])
        )
        for start, end, xref in hits:
            if start < cursor:
                # Overlaps a reference that was already linked.
                continue
            pieces.append(html.escape(text[cursor:start]))
            pieces.append(self._xref_markup(xref))
            cursor = end
        pieces.append(html.escape(text[cursor:]))
        return "".join(pieces)

    # ------------------------------------------------------------------
    # Inline content
    # ------------------------------------------------------------------

    def _convert_btx(self, element: ET._Element) -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.

        This is a simplified implementation. In a complete version, this would
        handle all the possible child elements defined in t_btx.

        Text is HTML-escaped.  Cross-references are linked in the element's
        own text and in the tails of its children, except when the element
        itself is a title element.
        """
        if element is None:
            return ""

        is_title = self._local_name(element) in self._TITLE_TAGS
        render_text = html.escape if is_title else self._link_xrefs

        parts = [render_text(element.text or "")]
        # tag="*" restricts the iteration to elements (skips comments / PIs).
        for child in element.iterchildren(tag="*"):
            parts.append(self._convert_inline(child))
            if child.tail:
                parts.append(render_text(child.tail))
        return "".join(parts)

    def _convert_inline(self, child: ET._Element) -> str:
        """Convert one child element found inside basic text to HTML."""
        child_tag = self._local_name(child)

        if child_tag == "HT":
            # Highlighted text.
            inner = self._convert_btx(child)
            ht_type = child.get("TYPE", "NORMAL")
            if ht_type in self._HT_ELEMENTS:
                name = self._HT_ELEMENTS[ht_type]
                return f"<{name}>{inner}</{name}>"
            if ht_type == "SC":
                # Small caps
                return f'<span class="small-caps">{inner}</span>'
            return inner

        if child_tag == "FT":
            # Format text (numbers, codes, etc.)
            inner = self._convert_btx(child)
            ft_type = child.get("TYPE", "")
            if ft_type in ("NUMBER", "DECIMAL"):
                return f'<span class="number">{inner}</span>'
            if ft_type == "CODE":
                return f"<code>{inner}</code>"
            return (
                f'<span class="formatted-text" '
                f'data-type="{html.escape(ft_type)}">{inner}</span>'
            )

        if child_tag == "IE":
            # Inclusion/exclusion marker
            return '<span class="inclusion-exclusion">±</span>'

        if child_tag == "BR":
            # Line break
            return "<br>"

        if child_tag == "P":
            # Paragraph
            return f"<p>{self._convert_btx(child)}</p>"

        if child_tag == "NOTE":
            # Note reference
            note_id = html.escape(child.get("NOTE.ID", ""))
            return (
                f'<span class="note" data-note-id="{note_id}">'
                f"{self._convert_btx(child)}</span>"
            )

        if child_tag == "QUOT.START":
            # Opening quotation mark
            return "“"

        if child_tag == "QUOT.END":
            # Closing quotation mark
            return "”"

        if child_tag == "LIST":
            list_type = child.get("TYPE", "BULLET")
            # Unknown TYPE values fall back to an unmarked list.
            list_style_type = self._LIST_STYLES.get(list_type, "none")
            items = self._convert_list(child)
            if list_type == "BULLET":
                return f"<ul>{items}</ul>"
            if list_type in self._ORDERED_LIST_TYPES:
                return (
                    f'<ol style="list-style-type: {list_style_type}">'
                    f"{items}</ol>"
                )
            return (
                f'<ul style="list-style-type: {list_style_type}">{items}</ul>'
            )

        if child_tag == "TXT":
            # Simple text element: flattened to escaped plain text.
            return html.escape(self._get_text(child))

        if child_tag == "LINK":
            uri = html.escape(child.get("URI", "#"))
            return f'<a href="{uri}">{self._convert_btx(child)}</a>'

        if child_tag == "REF.DOC.OJ":
            # Reference to an Official Journal document.
            coll = html.escape(child.get("COLL", ""))
            no_oj = html.escape(child.get("NO.OJ", ""))
            date = html.escape(child.get("DATE.PUB", ""))
            page = html.escape(child.get("PAGE.FIRST", ""))
            return (
                f'<cite class="ref-doc-oj">{coll} {no_oj}, {date}, '
                f"p. {page}</cite>"
            )

        # Recursively process other element types
        return self._convert_btx(child)

    def _convert_np(self, np: ET._Element) -> str:
        """Convert a numbered paragraph (NP) inside a list item."""
        no_p = self._first_child(np, "NO.P")
        txt = self._first_child(np, "TXT")
        if no_p is None or txt is None:
            # Not the expected NO.P + TXT shape: render whatever is there
            # instead of dropping the item's content.
            return self._convert_btx(np)

        num = html.escape(self._get_text(no_p))
        text = self._link_xrefs(self._get_text(txt))
        parts = [
            f'<span class="item-number">{num}</span> '
            f'<span class="item-text">{text}</span>'
        ]
        # Anything after the number and text (e.g. a P holding a nested LIST)
        # is rendered too.
        for extra in np.iterchildren(tag="*"):
            if extra is not no_p and extra is not txt:
                parts.append(self._convert_inline(extra))
        return "".join(parts)

    def _convert_list(self, list_element: ET._Element) -> str:
        """Convert a Formex LIST element to HTML list items."""
        items = []
        for item in list_element.iterchildren(tag=self._get_tag("ITEM")):
            item_parts = []
            # ITEM contents should be either NP or P elements.
            for child in item.iterchildren(tag="*"):
                if self._local_name(child) == "NP":
                    item_parts.append(self._convert_np(child))
                else:
                    item_parts.append(self._convert_btx(child))
            items.append(f"<li>{''.join(item_parts)}</li>")
        return "".join(items)

    # ------------------------------------------------------------------
    # Block content
    # ------------------------------------------------------------------

    def _convert_blocks(self, parent: ET._Element, allowed: frozenset) -> str:
        """Convert the direct block-level children of ``parent``.

        Children are rendered in document order; only tags named in
        ``allowed`` are considered.
        """
        parts = []
        for child in parent.iterchildren(tag="*"):
            name = self._local_name(child)
            if name not in allowed:
                continue
            if name == "ALINEA":
                parts.append(self._convert_alinea(child))
            elif name == "PARAG":
                parts.append(self._convert_parag(child))
            elif name == "COMMENT":
                parts.append(
                    f'<div class="comment">{self._convert_btx(child)}</div>'
                )
            elif name == "QUOT.S":
                parts.append(
                    f'<blockquote class="quotation">'
                    f"{self._convert_btx(child)}</blockquote>"
                )
            elif name == "SUBDIV":
                parts.append(self._convert_subdiv(child))
        return "".join(parts)

    def _convert_alinea(self, alinea: ET._Element) -> str:
        """Convert an ALINEA element to HTML."""
        # A div rather than a p, because an ALINEA may contain P and LIST.
        return f'<div class="alinea">{self._convert_btx(alinea)}</div>'

    def _convert_parag(self, parag: ET._Element) -> str:
        """Convert a PARAG (paragraph) element to HTML."""
        identifier = parag.get("IDENTIFIER", "")
        parag_id = self._create_id(identifier) if identifier else ""
        id_attr = f' id="{parag_id}"' if parag_id else ""

        no_parag = self._first_child(parag, "NO.PARAG")
        parag_num = html.escape(self._get_text(no_parag))

        content = self._convert_blocks(
            parag, frozenset({"ALINEA", "COMMENT", "QUOT.S"})
        )
        return (
            f'<div{id_attr} class="paragraph">'
            f'<span class="paragraph-number">{parag_num}</span>{content}</div>'
        )

    def _convert_subdiv(self, subdiv: ET._Element) -> str:
        """Convert a SUBDIV (subdivision) element to HTML."""
        title = ""
        title_elem = self._first_child(subdiv, "TITLE")
        if title_elem is not None:
            # TITLE holds one TI (title) and any number of STI (subtitles).
            ti_text = self._convert_btx(self._first_child(title_elem, "TI"))
            sti_list = [
                self._convert_btx(sti)
                for sti in title_elem.iterchildren(tag=self._get_tag("STI"))
            ]
            if ti_text:
                title = f'<h3 class="subdivision-title">{ti_text}</h3>'
            if sti_list:
                title += (
                    f'<p class="subdivision-subtitle">{" ".join(sti_list)}</p>'
                )

        content = self._convert_blocks(
            subdiv,
            frozenset({"PARAG", "ALINEA", "COMMENT", "QUOT.S", "SUBDIV"}),
        )
        return f'<section class="subdivision">{title}{content}</section>'

    def convert_article(self, article: Union[str, ET._Element]) -> str:
        """
        Convert a Formex <ARTICLE> element to HTML5.

        Processing instructions are stripped from the element in place.

        Args:
            article: Either an lxml Element or an XML string representing
                an ARTICLE

        Returns:
            A string containing the HTML5 representation of the article, or
            an error paragraph if an XML string could not be parsed.
        """
        # Parse the article if it's a string
        if isinstance(article, str):
            try:
                parser = ET.XMLParser(remove_blank_text=True)
                article = cast(
                    ET._Element, ET.fromstring(article.encode("utf-8"), parser)
                )
            except ET.XMLSyntaxError as e:
                return f"<p>Error parsing XML: {html.escape(str(e))}</p>"

        identifier = article.get("IDENTIFIER", "")
        article_id = self._create_id(identifier)

        # Strip processing instructions (their tail text is kept).
        ET.strip_tags(article, lxml.etree.PI)

        article_title = self._convert_btx(
            self._first_descendant(article, "TI.ART")
        )
        article_subtitle = self._convert_btx(
            self._first_descendant(article, "STI.ART")
        )

        # Build the header when there is a title, a subtitle, or both.
        header = ""
        if article_title or article_subtitle:
            header = "<header>"
            if article_title:
                header += f'<h2 class="article-title">{article_title}</h2>'
            if article_subtitle:
                header += f'<p class="article-subtitle">{article_subtitle}</p>'
            header += "</header>"

        content = self._convert_blocks(
            article,
            frozenset({"ALINEA", "PARAG", "COMMENT", "QUOT.S", "SUBDIV"}),
        )

        # Assemble the complete article
        return (
            f'<article id="{article_id}" class="formex-article">'
            f'{header}<div class="article-content">{content}</div></article>'
        )