fix: Type hints in Formex parser

This commit is contained in:
Adrian Rumpold
2025-05-20 08:37:16 +02:00
parent 1d467c827a
commit 56b5e3e3a4

View File

@@ -2,7 +2,7 @@ import html
import re import re
import warnings import warnings
from dataclasses import dataclass from dataclasses import dataclass
from typing import Literal, Optional, Union from typing import Literal, Optional, Union, cast
import lxml.etree import lxml.etree
from lxml import etree as ET from lxml import etree as ET
@@ -10,7 +10,7 @@ from lxml import etree as ET
from formex_viewer.main import Language from formex_viewer.main import Language
def text_content(el: lxml.etree.Element) -> str: def text_content(el: ET._Element) -> str:
"""Get the text content of an XML element, including all child elements.""" """Get the text content of an XML element, including all child elements."""
def _iterate(el): def _iterate(el):
@@ -35,7 +35,7 @@ class CrossReference:
paragraph: int | None = None paragraph: int | None = None
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]: def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
"""Extract cross-references from an XML element. """Extract cross-references from an XML element.
Args: Args:
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
match.group("art_num") if key == "article" else match.group("annex_num") match.group("art_num") if key == "article" else match.group("annex_num")
) )
parag_num = match.groupdict().get("parag_num") parag_num = match.groupdict().get("parag_num")
if not parag_num or key not in ["article", "annex"]:
raise RuntimeError()
crossref_text = match.group(0) crossref_text = match.group(0)
crossrefs.append( crossrefs.append(
CrossReference( CrossReference(
target=key, target=key,
id=crossref_id, id=crossref_id,
paragraph=parag_num, paragraph=int(parag_num),
text=crossref_text, text=crossref_text,
) )
) )
return crossrefs return crossrefs
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None: def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
"""Extract a specific article from a Formex document. """Extract a specific article from a Formex document.
Args: Args:
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No
def extract_paragraph( def extract_paragraph(
doc: ET.ElementBase, article_id: int, paragraph_id: int doc: ET._Element, article_id: int, paragraph_id: int
) -> ET.ElementBase | None: ) -> ET._Element | None:
"""Extract a specific paragraph from an article in a Formex document. """Extract a specific paragraph from an article in a Formex document.
Args: Args:
@@ -146,7 +150,7 @@ class FormexArticleConverter:
"""Get the tag name with namespace if available.""" """Get the tag name with namespace if available."""
return f"{self.ns_prefix}{tag}" return f"{self.ns_prefix}{tag}"
def _get_text(self, element: ET.Element) -> str: def _get_text(self, element: ET._Element) -> str:
"""Get the text content of an element, including all nested text. """Get the text content of an element, including all nested text.
This uses lxml's text_content() method when available, falling back to This uses lxml's text_content() method when available, falling back to
@@ -161,7 +165,7 @@ class FormexArticleConverter:
except AttributeError: except AttributeError:
# Fall back to manual traversal if text_content() is not available # Fall back to manual traversal if text_content() is not available
text = element.text or "" text = element.text or ""
for child in element: for child in element.iterchildren(tag="*"):
text += self._get_text(child) text += self._get_text(child)
if child.tail: if child.tail:
text += child.tail text += child.tail
@@ -182,7 +186,7 @@ class FormexArticleConverter:
) )
return text return text
def _convert_btx(self, element: ET.Element) -> str: def _convert_btx(self, element: ET._Element) -> str:
""" """
Convert basic text elements (t_btx, t_btx.seq) to HTML. Convert basic text elements (t_btx, t_btx.seq) to HTML.
@@ -202,7 +206,7 @@ class FormexArticleConverter:
# Replace the cross-reference text with a link # Replace the cross-reference text with a link
result = self._replace_xref(result, xref) result = self._replace_xref(result, xref)
for child in element: for child in element.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "") child_tag = child.tag.replace(self.ns_prefix, "")
# Process common inline elements # Process common inline elements
@@ -309,7 +313,7 @@ class FormexArticleConverter:
return result return result
def _convert_list(self, list_element: ET.Element) -> str: def _convert_list(self, list_element: ET._Element) -> str:
"""Convert a Formex LIST element to HTML list items.""" """Convert a Formex LIST element to HTML list items."""
result = "" result = ""
# Using lxml's xpath to get direct child ITEM elements # Using lxml's xpath to get direct child ITEM elements
@@ -347,11 +351,11 @@ class FormexArticleConverter:
return result return result
def _convert_alinea(self, alinea: ET.Element) -> str: def _convert_alinea(self, alinea: ET._Element) -> str:
"""Convert an ALINEA element to HTML.""" """Convert an ALINEA element to HTML."""
return f'<p class="alinea">{self._convert_btx(alinea)}</p>' return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
def _convert_parag(self, parag: ET.Element) -> str: def _convert_parag(self, parag: ET._Element) -> str:
"""Convert a PARAG (paragraph) element to HTML.""" """Convert a PARAG (paragraph) element to HTML."""
identifier = parag.get("IDENTIFIER", "") identifier = parag.get("IDENTIFIER", "")
parag_id = self._create_id(identifier) if identifier else "" parag_id = self._create_id(identifier) if identifier else ""
@@ -377,7 +381,7 @@ class FormexArticleConverter:
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>' return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
def _convert_subdiv(self, subdiv: ET.Element) -> str: def _convert_subdiv(self, subdiv: ET._Element) -> str:
"""Convert a SUBDIV (subdivision) element to HTML.""" """Convert a SUBDIV (subdivision) element to HTML."""
# Get the title using XPath # Get the title using XPath
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}") title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
@@ -423,7 +427,7 @@ class FormexArticleConverter:
return f'<section class="subdivision">{title}{content}</section>' return f'<section class="subdivision">{title}{content}</section>'
def convert_article(self, article: Union[str, ET.Element]) -> str: def convert_article(self, article: Union[str, ET._Element]) -> str:
""" """
Convert a Formex <ARTICLE> element to HTML5. Convert a Formex <ARTICLE> element to HTML5.
@@ -437,7 +441,9 @@ class FormexArticleConverter:
if isinstance(article, str): if isinstance(article, str):
try: try:
parser = ET.XMLParser(remove_blank_text=True) parser = ET.XMLParser(remove_blank_text=True)
article = ET.fromstring(article.encode("utf-8"), parser) article = cast(
ET._Element, ET.fromstring(article.encode("utf-8"), parser)
)
except ET.XMLSyntaxError as e: except ET.XMLSyntaxError as e:
return f"<p>Error parsing XML: {e}</p>" return f"<p>Error parsing XML: {e}</p>"