From 56b5e3e3a4478f1c5e7ac14eefd0c49d158ca309 Mon Sep 17 00:00:00 2001 From: Adrian Rumpold Date: Tue, 20 May 2025 08:37:16 +0200 Subject: [PATCH] fix: Type hints in Formex parser --- src/formex_viewer/formex4.py | 40 +++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/formex_viewer/formex4.py b/src/formex_viewer/formex4.py index 9df072e..334ca1f 100644 --- a/src/formex_viewer/formex4.py +++ b/src/formex_viewer/formex4.py @@ -2,7 +2,7 @@ import html import re import warnings from dataclasses import dataclass -from typing import Literal, Optional, Union +from typing import Literal, Optional, Union, cast import lxml.etree from lxml import etree as ET @@ -10,7 +10,7 @@ from lxml import etree as ET from formex_viewer.main import Language -def text_content(el: lxml.etree.Element) -> str: +def text_content(el: ET._Element) -> str: """Get the text content of an XML element, including all child elements.""" def _iterate(el): @@ -35,7 +35,7 @@ class CrossReference: paragraph: int | None = None -def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]: +def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]: """Extract cross-references from an XML element. Args: @@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer match.group("art_num") if key == "article" else match.group("annex_num") ) parag_num = match.groupdict().get("parag_num") + + if not parag_num or key not in ["article", "annex"]: + raise RuntimeError() + crossref_text = match.group(0) crossrefs.append( CrossReference( target=key, id=crossref_id, - paragraph=parag_num, + paragraph=int(parag_num), text=crossref_text, ) ) return crossrefs -def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None: +def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None: """Extract a specific article from a Formex document. Args: @@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No def extract_paragraph( - doc: ET.ElementBase, article_id: int, paragraph_id: int -) -> ET.ElementBase | None: + doc: ET._Element, article_id: int, paragraph_id: int +) -> ET._Element | None: """Extract a specific paragraph from an article in a Formex document. Args: @@ -146,7 +150,7 @@ class FormexArticleConverter: """Get the tag name with namespace if available.""" return f"{self.ns_prefix}{tag}" - def _get_text(self, element: ET.Element) -> str: + def _get_text(self, element: ET._Element) -> str: """Get the text content of an element, including all nested text. This uses lxml's text_content() method when available, falling back to @@ -161,7 +165,7 @@ class FormexArticleConverter: except AttributeError: # Fall back to manual traversal if text_content() is not available text = element.text or "" - for child in element: + for child in element.iterchildren(tag="*"): text += self._get_text(child) if child.tail: text += child.tail @@ -182,7 +186,7 @@ class FormexArticleConverter: ) return text - def _convert_btx(self, element: ET.Element) -> str: + def _convert_btx(self, element: ET._Element) -> str: """ Convert basic text elements (t_btx, t_btx.seq) to HTML. @@ -202,7 +206,7 @@ class FormexArticleConverter: # Replace the cross-reference text with a link result = self._replace_xref(result, xref) - for child in element: + for child in element.iterchildren(tag="*"): child_tag = child.tag.replace(self.ns_prefix, "") # Process common inline elements @@ -309,7 +313,7 @@ class FormexArticleConverter: return result - def _convert_list(self, list_element: ET.Element) -> str: + def _convert_list(self, list_element: ET._Element) -> str: """Convert a Formex LIST element to HTML list items.""" result = "" # Using lxml's xpath to get direct child ITEM elements @@ -347,11 +351,11 @@ class FormexArticleConverter: return result - def _convert_alinea(self, alinea: ET.Element) -> str: + def _convert_alinea(self, alinea: ET._Element) -> str: """Convert an ALINEA element to HTML.""" return f'

{self._convert_btx(alinea)}

' - def _convert_parag(self, parag: ET.Element) -> str: + def _convert_parag(self, parag: ET._Element) -> str: """Convert a PARAG (paragraph) element to HTML.""" identifier = parag.get("IDENTIFIER", "") parag_id = self._create_id(identifier) if identifier else "" @@ -377,7 +381,7 @@ class FormexArticleConverter: return f'
{parag_num}{content}
' - def _convert_subdiv(self, subdiv: ET.Element) -> str: + def _convert_subdiv(self, subdiv: ET._Element) -> str: """Convert a SUBDIV (subdivision) element to HTML.""" # Get the title using XPath title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}") @@ -423,7 +427,7 @@ class FormexArticleConverter: return f'
{title}{content}
' - def convert_article(self, article: Union[str, ET.Element]) -> str: + def convert_article(self, article: Union[str, ET._Element]) -> str: """ Convert a Formex
element to HTML5. @@ -437,7 +441,9 @@ class FormexArticleConverter: if isinstance(article, str): try: parser = ET.XMLParser(remove_blank_text=True) - article = ET.fromstring(article.encode("utf-8"), parser) + article = cast( + ET._Element, ET.fromstring(article.encode("utf-8"), parser) + ) except ET.XMLSyntaxError as e: return f"

Error parsing XML: {e}

"