From 56b5e3e3a4478f1c5e7ac14eefd0c49d158ca309 Mon Sep 17 00:00:00 2001
From: Adrian Rumpold <a.rumpold@gmail.com>
Date: Tue, 20 May 2025 08:37:16 +0200
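Subject: [PATCH] fix: Type hints in Formex parser

Annotate XML element parameters and return values with ET._Element
instead of the lxml.etree.Element / ET.Element factory functions and
ET.ElementBase, and cast the result of ET.fromstring() accordingly.

Note that this is not a pure annotation change; runtime behavior
changes as well:

- extract_xrefs() now raises RuntimeError when a match carries no
  paragraph number or when the pattern key is neither "article" nor
  "annex", and passes the paragraph number to CrossReference as an int
  rather than as the matched string.
- _get_text() and _convert_btx() iterate with iterchildren(tag="*"),
  which visits element children only and skips comments and processing
  instructions.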
---
 src/formex_viewer/formex4.py | 40 +++++++++++++++++++++---------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/src/formex_viewer/formex4.py b/src/formex_viewer/formex4.py
index 9df072e..334ca1f 100644
--- a/src/formex_viewer/formex4.py
+++ b/src/formex_viewer/formex4.py
@@ -2,7 +2,7 @@ import html
 import re
 import warnings
 from dataclasses import dataclass
-from typing import Literal, Optional, Union
+from typing import Literal, Optional, Union, cast
 
 import lxml.etree
 from lxml import etree as ET
@@ -10,7 +10,7 @@ from lxml import etree as ET
 from formex_viewer.main import Language
 
 
-def text_content(el: lxml.etree.Element) -> str:
+def text_content(el: ET._Element) -> str:
     """Get the text content of an XML element, including all child elements."""
 
     def _iterate(el):
@@ -35,7 +35,7 @@ class CrossReference:
     paragraph: int | None = None
 
 
-def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
+def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
     """Extract cross-references from an XML element.
 
     Args:
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
                 match.group("art_num") if key == "article" else match.group("annex_num")
             )
             parag_num = match.groupdict().get("parag_num")
+
+            if not parag_num or key not in ["article", "annex"]:
+                raise RuntimeError()
+
             crossref_text = match.group(0)
             crossrefs.append(
                 CrossReference(
                     target=key,
                     id=crossref_id,
-                    paragraph=parag_num,
+                    paragraph=int(parag_num),
                     text=crossref_text,
                 )
             )
     return crossrefs
 
 
-def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
+def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
     """Extract a specific article from a Formex document.
 
     Args:
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No
 
 
 def extract_paragraph(
-    doc: ET.ElementBase, article_id: int, paragraph_id: int
-) -> ET.ElementBase | None:
+    doc: ET._Element, article_id: int, paragraph_id: int
+) -> ET._Element | None:
     """Extract a specific paragraph from an article in a Formex document.
 
     Args:
@@ -146,7 +150,7 @@ class FormexArticleConverter:
         """Get the tag name with namespace if available."""
         return f"{self.ns_prefix}{tag}"
 
-    def _get_text(self, element: ET.Element) -> str:
+    def _get_text(self, element: ET._Element) -> str:
         """Get the text content of an element, including all nested text.
 
         This uses lxml's text_content() method when available, falling back to
@@ -161,7 +165,7 @@ class FormexArticleConverter:
         except AttributeError:
             # Fall back to manual traversal if text_content() is not available
             text = element.text or ""
-            for child in element:
+            for child in element.iterchildren(tag="*"):
                 text += self._get_text(child)
                 if child.tail:
                     text += child.tail
@@ -182,7 +186,7 @@ class FormexArticleConverter:
         )
         return text
 
-    def _convert_btx(self, element: ET.Element) -> str:
+    def _convert_btx(self, element: ET._Element) -> str:
         """
         Convert basic text elements (t_btx, t_btx.seq) to HTML.
 
@@ -202,7 +206,7 @@ class FormexArticleConverter:
                 # Replace the cross-reference text with a link
                 result = self._replace_xref(result, xref)
 
-        for child in element:
+        for child in element.iterchildren(tag="*"):
             child_tag = child.tag.replace(self.ns_prefix, "")
 
             # Process common inline elements
@@ -309,7 +313,7 @@ class FormexArticleConverter:
 
         return result
 
-    def _convert_list(self, list_element: ET.Element) -> str:
+    def _convert_list(self, list_element: ET._Element) -> str:
         """Convert a Formex LIST element to HTML list items."""
         result = ""
         # Using lxml's xpath to get direct child ITEM elements
@@ -347,11 +351,11 @@ class FormexArticleConverter:
 
         return result
 
-    def _convert_alinea(self, alinea: ET.Element) -> str:
+    def _convert_alinea(self, alinea: ET._Element) -> str:
         """Convert an ALINEA element to HTML."""
         return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
 
-    def _convert_parag(self, parag: ET.Element) -> str:
+    def _convert_parag(self, parag: ET._Element) -> str:
         """Convert a PARAG (paragraph) element to HTML."""
         identifier = parag.get("IDENTIFIER", "")
         parag_id = self._create_id(identifier) if identifier else ""
@@ -377,7 +381,7 @@ class FormexArticleConverter:
 
         return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
 
-    def _convert_subdiv(self, subdiv: ET.Element) -> str:
+    def _convert_subdiv(self, subdiv: ET._Element) -> str:
         """Convert a SUBDIV (subdivision) element to HTML."""
         # Get the title using XPath
         title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
@@ -423,7 +427,7 @@ class FormexArticleConverter:
 
         return f'<section class="subdivision">{title}{content}</section>'
 
-    def convert_article(self, article: Union[str, ET.Element]) -> str:
+    def convert_article(self, article: Union[str, ET._Element]) -> str:
         """
         Convert a Formex <ARTICLE> element to HTML5.
 
@@ -437,7 +441,9 @@ class FormexArticleConverter:
         if isinstance(article, str):
             try:
                 parser = ET.XMLParser(remove_blank_text=True)
-                article = ET.fromstring(article.encode("utf-8"), parser)
+                article = cast(
+                    ET._Element, ET.fromstring(article.encode("utf-8"), parser)
+                )
             except ET.XMLSyntaxError as e:
                 return f"<p>Error parsing XML: {e}</p>"