fix: Type hints in Formex parser
This commit is contained in:
@@ -2,7 +2,7 @@ import html
|
||||
import re
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional, Union
|
||||
from typing import Literal, Optional, Union, cast
|
||||
|
||||
import lxml.etree
|
||||
from lxml import etree as ET
|
||||
@@ -10,7 +10,7 @@ from lxml import etree as ET
|
||||
from formex_viewer.main import Language
|
||||
|
||||
|
||||
def text_content(el: lxml.etree.Element) -> str:
|
||||
def text_content(el: ET._Element) -> str:
|
||||
"""Get the text content of an XML element, including all child elements."""
|
||||
|
||||
def _iterate(el):
|
||||
@@ -35,7 +35,7 @@ class CrossReference:
|
||||
paragraph: int | None = None
|
||||
|
||||
|
||||
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
|
||||
def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
|
||||
"""Extract cross-references from an XML element.
|
||||
|
||||
Args:
|
||||
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
|
||||
match.group("art_num") if key == "article" else match.group("annex_num")
|
||||
)
|
||||
parag_num = match.groupdict().get("parag_num")
|
||||
|
||||
if not parag_num or key not in ["article", "annex"]:
|
||||
raise RuntimeError()
|
||||
|
||||
crossref_text = match.group(0)
|
||||
crossrefs.append(
|
||||
CrossReference(
|
||||
target=key,
|
||||
id=crossref_id,
|
||||
paragraph=parag_num,
|
||||
paragraph=int(parag_num),
|
||||
text=crossref_text,
|
||||
)
|
||||
)
|
||||
return crossrefs
|
||||
|
||||
|
||||
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
|
||||
def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
|
||||
"""Extract a specific article from a Formex document.
|
||||
|
||||
Args:
|
||||
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No
|
||||
|
||||
|
||||
def extract_paragraph(
|
||||
doc: ET.ElementBase, article_id: int, paragraph_id: int
|
||||
) -> ET.ElementBase | None:
|
||||
doc: ET._Element, article_id: int, paragraph_id: int
|
||||
) -> ET._Element | None:
|
||||
"""Extract a specific paragraph from an article in a Formex document.
|
||||
|
||||
Args:
|
||||
@@ -146,7 +150,7 @@ class FormexArticleConverter:
|
||||
"""Get the tag name with namespace if available."""
|
||||
return f"{self.ns_prefix}{tag}"
|
||||
|
||||
def _get_text(self, element: ET.Element) -> str:
|
||||
def _get_text(self, element: ET._Element) -> str:
|
||||
"""Get the text content of an element, including all nested text.
|
||||
|
||||
This uses lxml's text_content() method when available, falling back to
|
||||
@@ -161,7 +165,7 @@ class FormexArticleConverter:
|
||||
except AttributeError:
|
||||
# Fall back to manual traversal if text_content() is not available
|
||||
text = element.text or ""
|
||||
for child in element:
|
||||
for child in element.iterchildren(tag="*"):
|
||||
text += self._get_text(child)
|
||||
if child.tail:
|
||||
text += child.tail
|
||||
@@ -182,7 +186,7 @@ class FormexArticleConverter:
|
||||
)
|
||||
return text
|
||||
|
||||
def _convert_btx(self, element: ET.Element) -> str:
|
||||
def _convert_btx(self, element: ET._Element) -> str:
|
||||
"""
|
||||
Convert basic text elements (t_btx, t_btx.seq) to HTML.
|
||||
|
||||
@@ -202,7 +206,7 @@ class FormexArticleConverter:
|
||||
# Replace the cross-reference text with a link
|
||||
result = self._replace_xref(result, xref)
|
||||
|
||||
for child in element:
|
||||
for child in element.iterchildren(tag="*"):
|
||||
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||
|
||||
# Process common inline elements
|
||||
@@ -309,7 +313,7 @@ class FormexArticleConverter:
|
||||
|
||||
return result
|
||||
|
||||
def _convert_list(self, list_element: ET.Element) -> str:
|
||||
def _convert_list(self, list_element: ET._Element) -> str:
|
||||
"""Convert a Formex LIST element to HTML list items."""
|
||||
result = ""
|
||||
# Using lxml's xpath to get direct child ITEM elements
|
||||
@@ -347,11 +351,11 @@ class FormexArticleConverter:
|
||||
|
||||
return result
|
||||
|
||||
def _convert_alinea(self, alinea: ET.Element) -> str:
|
||||
def _convert_alinea(self, alinea: ET._Element) -> str:
|
||||
"""Convert an ALINEA element to HTML."""
|
||||
return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
|
||||
|
||||
def _convert_parag(self, parag: ET.Element) -> str:
|
||||
def _convert_parag(self, parag: ET._Element) -> str:
|
||||
"""Convert a PARAG (paragraph) element to HTML."""
|
||||
identifier = parag.get("IDENTIFIER", "")
|
||||
parag_id = self._create_id(identifier) if identifier else ""
|
||||
@@ -377,7 +381,7 @@ class FormexArticleConverter:
|
||||
|
||||
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
|
||||
|
||||
def _convert_subdiv(self, subdiv: ET.Element) -> str:
|
||||
def _convert_subdiv(self, subdiv: ET._Element) -> str:
|
||||
"""Convert a SUBDIV (subdivision) element to HTML."""
|
||||
# Get the title using XPath
|
||||
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
|
||||
@@ -423,7 +427,7 @@ class FormexArticleConverter:
|
||||
|
||||
return f'<section class="subdivision">{title}{content}</section>'
|
||||
|
||||
def convert_article(self, article: Union[str, ET.Element]) -> str:
|
||||
def convert_article(self, article: Union[str, ET._Element]) -> str:
|
||||
"""
|
||||
Convert a Formex <ARTICLE> element to HTML5.
|
||||
|
||||
@@ -437,7 +441,9 @@ class FormexArticleConverter:
|
||||
if isinstance(article, str):
|
||||
try:
|
||||
parser = ET.XMLParser(remove_blank_text=True)
|
||||
article = ET.fromstring(article.encode("utf-8"), parser)
|
||||
article = cast(
|
||||
ET._Element, ET.fromstring(article.encode("utf-8"), parser)
|
||||
)
|
||||
except ET.XMLSyntaxError as e:
|
||||
return f"<p>Error parsing XML: {e}</p>"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user