fix: Type hints in Formex parser

This commit is contained in:
Adrian Rumpold
2025-05-20 08:37:16 +02:00
parent 1d467c827a
commit 56b5e3e3a4

View File

@@ -2,7 +2,7 @@ import html
import re
import warnings
from dataclasses import dataclass
from typing import Literal, Optional, Union
from typing import Literal, Optional, Union, cast
import lxml.etree
from lxml import etree as ET
@@ -10,7 +10,7 @@ from lxml import etree as ET
from formex_viewer.main import Language
def text_content(el: lxml.etree.Element) -> str:
def text_content(el: ET._Element) -> str:
"""Get the text content of an XML element, including all child elements."""
def _iterate(el):
@@ -35,7 +35,7 @@ class CrossReference:
paragraph: int | None = None
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
"""Extract cross-references from an XML element.
Args:
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
match.group("art_num") if key == "article" else match.group("annex_num")
)
parag_num = match.groupdict().get("parag_num")
if not parag_num or key not in ["article", "annex"]:
raise RuntimeError()
crossref_text = match.group(0)
crossrefs.append(
CrossReference(
target=key,
id=crossref_id,
paragraph=parag_num,
paragraph=int(parag_num),
text=crossref_text,
)
)
return crossrefs
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
"""Extract a specific article from a Formex document.
Args:
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No
def extract_paragraph(
doc: ET.ElementBase, article_id: int, paragraph_id: int
) -> ET.ElementBase | None:
doc: ET._Element, article_id: int, paragraph_id: int
) -> ET._Element | None:
"""Extract a specific paragraph from an article in a Formex document.
Args:
@@ -146,7 +150,7 @@ class FormexArticleConverter:
"""Get the tag name with namespace if available."""
return f"{self.ns_prefix}{tag}"
def _get_text(self, element: ET.Element) -> str:
def _get_text(self, element: ET._Element) -> str:
"""Get the text content of an element, including all nested text.
This uses lxml's text_content() method when available, falling back to
@@ -161,7 +165,7 @@ class FormexArticleConverter:
except AttributeError:
# Fall back to manual traversal if text_content() is not available
text = element.text or ""
for child in element:
for child in element.iterchildren(tag="*"):
text += self._get_text(child)
if child.tail:
text += child.tail
@@ -182,7 +186,7 @@ class FormexArticleConverter:
)
return text
def _convert_btx(self, element: ET.Element) -> str:
def _convert_btx(self, element: ET._Element) -> str:
"""
Convert basic text elements (t_btx, t_btx.seq) to HTML.
@@ -202,7 +206,7 @@ class FormexArticleConverter:
# Replace the cross-reference text with a link
result = self._replace_xref(result, xref)
for child in element:
for child in element.iterchildren(tag="*"):
child_tag = child.tag.replace(self.ns_prefix, "")
# Process common inline elements
@@ -309,7 +313,7 @@ class FormexArticleConverter:
return result
def _convert_list(self, list_element: ET.Element) -> str:
def _convert_list(self, list_element: ET._Element) -> str:
"""Convert a Formex LIST element to HTML list items."""
result = ""
# Using lxml's xpath to get direct child ITEM elements
@@ -347,11 +351,11 @@ class FormexArticleConverter:
return result
def _convert_alinea(self, alinea: ET.Element) -> str:
def _convert_alinea(self, alinea: ET._Element) -> str:
"""Convert an ALINEA element to HTML."""
return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
def _convert_parag(self, parag: ET.Element) -> str:
def _convert_parag(self, parag: ET._Element) -> str:
"""Convert a PARAG (paragraph) element to HTML."""
identifier = parag.get("IDENTIFIER", "")
parag_id = self._create_id(identifier) if identifier else ""
@@ -377,7 +381,7 @@ class FormexArticleConverter:
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
def _convert_subdiv(self, subdiv: ET.Element) -> str:
def _convert_subdiv(self, subdiv: ET._Element) -> str:
"""Convert a SUBDIV (subdivision) element to HTML."""
# Get the title using XPath
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
@@ -423,7 +427,7 @@ class FormexArticleConverter:
return f'<section class="subdivision">{title}{content}</section>'
def convert_article(self, article: Union[str, ET.Element]) -> str:
def convert_article(self, article: Union[str, ET._Element]) -> str:
"""
Convert a Formex <ARTICLE> element to HTML5.
@@ -437,7 +441,9 @@ class FormexArticleConverter:
if isinstance(article, str):
try:
parser = ET.XMLParser(remove_blank_text=True)
article = ET.fromstring(article.encode("utf-8"), parser)
article = cast(
ET._Element, ET.fromstring(article.encode("utf-8"), parser)
)
except ET.XMLSyntaxError as e:
return f"<p>Error parsing XML: {e}</p>"