fix: Type hints in Formex parser

2025-05-20 08:37:16 +02:00
parent 1d467c827a
commit 56b5e3e3a4
1 changed files with 23 additions and 17 deletions
@@ -2,7 +2,7 @@ import html
 import re
 import warnings
 from dataclasses import dataclass
-from typing import Literal, Optional, Union
+from typing import Literal, Optional, Union, cast

 import lxml.etree
 from lxml import etree as ET
@@ -10,7 +10,7 @@ from lxml import etree as ET
 from formex_viewer.main import Language


-def text_content(el: lxml.etree.Element) -> str:
+def text_content(el: ET._Element) -> str:
    """Get the text content of an XML element, including all child elements."""

    def _iterate(el):
@@ -35,7 +35,7 @@ class CrossReference:
    paragraph: int | None = None


-def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
+def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
    """Extract cross-references from an XML element.

    Args:
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
                match.group("art_num") if key == "article" else match.group("annex_num")
            )
            parag_num = match.groupdict().get("parag_num")
+
+            if not parag_num or key not in ["article", "annex"]:
+                raise RuntimeError()
+
            crossref_text = match.group(0)
            crossrefs.append(
                CrossReference(
                    target=key,
                    id=crossref_id,
-                    paragraph=parag_num,
+                    paragraph=int(parag_num),
                    text=crossref_text,
                )
            )
    return crossrefs


-def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
+def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
    """Extract a specific article from a Formex document.

    Args:
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No


 def extract_paragraph(
-    doc: ET.ElementBase, article_id: int, paragraph_id: int
-) -> ET.ElementBase | None:
+    doc: ET._Element, article_id: int, paragraph_id: int
+) -> ET._Element | None:
    """Extract a specific paragraph from an article in a Formex document.

    Args:
@@ -146,7 +150,7 @@ class FormexArticleConverter:
        """Get the tag name with namespace if available."""
        return f"{self.ns_prefix}{tag}"

-    def _get_text(self, element: ET.Element) -> str:
+    def _get_text(self, element: ET._Element) -> str:
        """Get the text content of an element, including all nested text.

        This uses lxml's text_content() method when available, falling back to
@@ -161,7 +165,7 @@ class FormexArticleConverter:
        except AttributeError:
            # Fall back to manual traversal if text_content() is not available
            text = element.text or ""
-            for child in element:
+            for child in element.iterchildren(tag="*"):
                text += self._get_text(child)
                if child.tail:
                    text += child.tail
@@ -182,7 +186,7 @@ class FormexArticleConverter:
        )
        return text

-    def _convert_btx(self, element: ET.Element) -> str:
+    def _convert_btx(self, element: ET._Element) -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.

@@ -202,7 +206,7 @@ class FormexArticleConverter:
                # Replace the cross-reference text with a link
                result = self._replace_xref(result, xref)

-        for child in element:
+        for child in element.iterchildren(tag="*"):
            child_tag = child.tag.replace(self.ns_prefix, "")

            # Process common inline elements
@@ -309,7 +313,7 @@ class FormexArticleConverter:

        return result

-    def _convert_list(self, list_element: ET.Element) -> str:
+    def _convert_list(self, list_element: ET._Element) -> str:
        """Convert a Formex LIST element to HTML list items."""
        result = ""
        # Using lxml's xpath to get direct child ITEM elements
@@ -347,11 +351,11 @@ class FormexArticleConverter:

        return result

-    def _convert_alinea(self, alinea: ET.Element) -> str:
+    def _convert_alinea(self, alinea: ET._Element) -> str:
        """Convert an ALINEA element to HTML."""
        return f'<p class="alinea">{self._convert_btx(alinea)}</p>'

-    def _convert_parag(self, parag: ET.Element) -> str:
+    def _convert_parag(self, parag: ET._Element) -> str:
        """Convert a PARAG (paragraph) element to HTML."""
        identifier = parag.get("IDENTIFIER", "")
        parag_id = self._create_id(identifier) if identifier else ""
@@ -377,7 +381,7 @@ class FormexArticleConverter:

        return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'

-    def _convert_subdiv(self, subdiv: ET.Element) -> str:
+    def _convert_subdiv(self, subdiv: ET._Element) -> str:
        """Convert a SUBDIV (subdivision) element to HTML."""
        # Get the title using XPath
        title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
@@ -423,7 +427,7 @@ class FormexArticleConverter:

        return f'<section class="subdivision">{title}{content}</section>'

-    def convert_article(self, article: Union[str, ET.Element]) -> str:
+    def convert_article(self, article: Union[str, ET._Element]) -> str:
        """
        Convert a Formex <ARTICLE> element to HTML5.

@@ -437,7 +441,9 @@ class FormexArticleConverter:
        if isinstance(article, str):
            try:
                parser = ET.XMLParser(remove_blank_text=True)
-                article = ET.fromstring(article.encode("utf-8"), parser)
+                article = cast(
+                    ET._Element, ET.fromstring(article.encode("utf-8"), parser)
+                )
            except ET.XMLSyntaxError as e:
                return f"<p>Error parsing XML: {e}</p>"