fix: Type hints in Formex parser
This commit is contained in:
@@ -2,7 +2,7 @@ import html
|
|||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Literal, Optional, Union
|
from typing import Literal, Optional, Union, cast
|
||||||
|
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
@@ -10,7 +10,7 @@ from lxml import etree as ET
|
|||||||
from formex_viewer.main import Language
|
from formex_viewer.main import Language
|
||||||
|
|
||||||
|
|
||||||
def text_content(el: lxml.etree.Element) -> str:
|
def text_content(el: ET._Element) -> str:
|
||||||
"""Get the text content of an XML element, including all child elements."""
|
"""Get the text content of an XML element, including all child elements."""
|
||||||
|
|
||||||
def _iterate(el):
|
def _iterate(el):
|
||||||
@@ -35,7 +35,7 @@ class CrossReference:
|
|||||||
paragraph: int | None = None
|
paragraph: int | None = None
|
||||||
|
|
||||||
|
|
||||||
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
|
def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
|
||||||
"""Extract cross-references from an XML element.
|
"""Extract cross-references from an XML element.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
|
|||||||
match.group("art_num") if key == "article" else match.group("annex_num")
|
match.group("art_num") if key == "article" else match.group("annex_num")
|
||||||
)
|
)
|
||||||
parag_num = match.groupdict().get("parag_num")
|
parag_num = match.groupdict().get("parag_num")
|
||||||
|
|
||||||
|
if not parag_num or key not in ["article", "annex"]:
|
||||||
|
raise RuntimeError()
|
||||||
|
|
||||||
crossref_text = match.group(0)
|
crossref_text = match.group(0)
|
||||||
crossrefs.append(
|
crossrefs.append(
|
||||||
CrossReference(
|
CrossReference(
|
||||||
target=key,
|
target=key,
|
||||||
id=crossref_id,
|
id=crossref_id,
|
||||||
paragraph=parag_num,
|
paragraph=int(parag_num),
|
||||||
text=crossref_text,
|
text=crossref_text,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return crossrefs
|
return crossrefs
|
||||||
|
|
||||||
|
|
||||||
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
|
def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
|
||||||
"""Extract a specific article from a Formex document.
|
"""Extract a specific article from a Formex document.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No
|
|||||||
|
|
||||||
|
|
||||||
def extract_paragraph(
|
def extract_paragraph(
|
||||||
doc: ET.ElementBase, article_id: int, paragraph_id: int
|
doc: ET._Element, article_id: int, paragraph_id: int
|
||||||
) -> ET.ElementBase | None:
|
) -> ET._Element | None:
|
||||||
"""Extract a specific paragraph from an article in a Formex document.
|
"""Extract a specific paragraph from an article in a Formex document.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -146,7 +150,7 @@ class FormexArticleConverter:
|
|||||||
"""Get the tag name with namespace if available."""
|
"""Get the tag name with namespace if available."""
|
||||||
return f"{self.ns_prefix}{tag}"
|
return f"{self.ns_prefix}{tag}"
|
||||||
|
|
||||||
def _get_text(self, element: ET.Element) -> str:
|
def _get_text(self, element: ET._Element) -> str:
|
||||||
"""Get the text content of an element, including all nested text.
|
"""Get the text content of an element, including all nested text.
|
||||||
|
|
||||||
This uses lxml's text_content() method when available, falling back to
|
This uses lxml's text_content() method when available, falling back to
|
||||||
@@ -161,7 +165,7 @@ class FormexArticleConverter:
|
|||||||
except AttributeError:
|
except AttributeError:
|
||||||
# Fall back to manual traversal if text_content() is not available
|
# Fall back to manual traversal if text_content() is not available
|
||||||
text = element.text or ""
|
text = element.text or ""
|
||||||
for child in element:
|
for child in element.iterchildren(tag="*"):
|
||||||
text += self._get_text(child)
|
text += self._get_text(child)
|
||||||
if child.tail:
|
if child.tail:
|
||||||
text += child.tail
|
text += child.tail
|
||||||
@@ -182,7 +186,7 @@ class FormexArticleConverter:
|
|||||||
)
|
)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _convert_btx(self, element: ET.Element) -> str:
|
def _convert_btx(self, element: ET._Element) -> str:
|
||||||
"""
|
"""
|
||||||
Convert basic text elements (t_btx, t_btx.seq) to HTML.
|
Convert basic text elements (t_btx, t_btx.seq) to HTML.
|
||||||
|
|
||||||
@@ -202,7 +206,7 @@ class FormexArticleConverter:
|
|||||||
# Replace the cross-reference text with a link
|
# Replace the cross-reference text with a link
|
||||||
result = self._replace_xref(result, xref)
|
result = self._replace_xref(result, xref)
|
||||||
|
|
||||||
for child in element:
|
for child in element.iterchildren(tag="*"):
|
||||||
child_tag = child.tag.replace(self.ns_prefix, "")
|
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||||
|
|
||||||
# Process common inline elements
|
# Process common inline elements
|
||||||
@@ -309,7 +313,7 @@ class FormexArticleConverter:
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _convert_list(self, list_element: ET.Element) -> str:
|
def _convert_list(self, list_element: ET._Element) -> str:
|
||||||
"""Convert a Formex LIST element to HTML list items."""
|
"""Convert a Formex LIST element to HTML list items."""
|
||||||
result = ""
|
result = ""
|
||||||
# Using lxml's xpath to get direct child ITEM elements
|
# Using lxml's xpath to get direct child ITEM elements
|
||||||
@@ -347,11 +351,11 @@ class FormexArticleConverter:
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _convert_alinea(self, alinea: ET.Element) -> str:
|
def _convert_alinea(self, alinea: ET._Element) -> str:
|
||||||
"""Convert an ALINEA element to HTML."""
|
"""Convert an ALINEA element to HTML."""
|
||||||
return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
|
return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
|
||||||
|
|
||||||
def _convert_parag(self, parag: ET.Element) -> str:
|
def _convert_parag(self, parag: ET._Element) -> str:
|
||||||
"""Convert a PARAG (paragraph) element to HTML."""
|
"""Convert a PARAG (paragraph) element to HTML."""
|
||||||
identifier = parag.get("IDENTIFIER", "")
|
identifier = parag.get("IDENTIFIER", "")
|
||||||
parag_id = self._create_id(identifier) if identifier else ""
|
parag_id = self._create_id(identifier) if identifier else ""
|
||||||
@@ -377,7 +381,7 @@ class FormexArticleConverter:
|
|||||||
|
|
||||||
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
|
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
|
||||||
|
|
||||||
def _convert_subdiv(self, subdiv: ET.Element) -> str:
|
def _convert_subdiv(self, subdiv: ET._Element) -> str:
|
||||||
"""Convert a SUBDIV (subdivision) element to HTML."""
|
"""Convert a SUBDIV (subdivision) element to HTML."""
|
||||||
# Get the title using XPath
|
# Get the title using XPath
|
||||||
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
|
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
|
||||||
@@ -423,7 +427,7 @@ class FormexArticleConverter:
|
|||||||
|
|
||||||
return f'<section class="subdivision">{title}{content}</section>'
|
return f'<section class="subdivision">{title}{content}</section>'
|
||||||
|
|
||||||
def convert_article(self, article: Union[str, ET.Element]) -> str:
|
def convert_article(self, article: Union[str, ET._Element]) -> str:
|
||||||
"""
|
"""
|
||||||
Convert a Formex <ARTICLE> element to HTML5.
|
Convert a Formex <ARTICLE> element to HTML5.
|
||||||
|
|
||||||
@@ -437,7 +441,9 @@ class FormexArticleConverter:
|
|||||||
if isinstance(article, str):
|
if isinstance(article, str):
|
||||||
try:
|
try:
|
||||||
parser = ET.XMLParser(remove_blank_text=True)
|
parser = ET.XMLParser(remove_blank_text=True)
|
||||||
article = ET.fromstring(article.encode("utf-8"), parser)
|
article = cast(
|
||||||
|
ET._Element, ET.fromstring(article.encode("utf-8"), parser)
|
||||||
|
)
|
||||||
except ET.XMLSyntaxError as e:
|
except ET.XMLSyntaxError as e:
|
||||||
return f"<p>Error parsing XML: {e}</p>"
|
return f"<p>Error parsing XML: {e}</p>"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user