Files
formex-viewer/src/formex_viewer/formex4.py
2025-05-20 09:05:32 +02:00

497 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import html
import re
import warnings
from dataclasses import dataclass
from typing import Literal, Optional, Union, cast
import lxml.etree
from lxml import etree as ET
from formex_viewer.main import Language
def text_content(el: ET._Element) -> str:
    """Return the text of ``el`` and all of its descendants in document order.

    The element's own tail (the text that directly follows its closing tag) is
    appended once at the end. It is kept because ``_convert_btx`` passes a child
    element through ``extract_xrefs`` to find the references located in that
    child's tail.

    Args:
        el: The XML element whose text should be collected.

    Returns:
        The concatenated text, or an empty string when there is none.
    """
    # itertext() walks the subtree in document order and leaves out the tail of
    # the element it is called on, so every piece of text is emitted exactly once.
    parts = list(el.itertext())
    if el.tail:
        parts.append(el.tail)
    return "".join(parts)
@dataclass
class CrossReference:
    """A reference to an article or an annex that was found in running text."""

    # Kind of provision the reference points to.
    target: Literal["article", "annex"]
    # The reference exactly as it was matched in the text.
    text: str
    # Article number or annex numeral, kept as the matched string.
    id: str
    # Paragraph number when the reference carries one, otherwise None.
    paragraph: Optional[int] = None
def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
    """Extract article and annex cross-references from an element's text.

    The text is obtained with ``text_content(el)`` and scanned
    case-insensitively, once for article references (optionally followed by a
    paragraph number in parentheses) and once for annex references.

    Args:
        el: The XML element to extract cross-references from.
        language: Language of the text; selects the keywords to look for.

    Returns:
        One ``CrossReference`` per match, all article matches first and then all
        annex matches. The list is empty (and a warning is issued) when the
        language is not supported.
    """
    pattern_parts = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }
    if language not in pattern_parts:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
        )
        return []

    parts = pattern_parts[language]
    article = parts["article"]
    annex = parts["annex"]
    # Negative lookahead that rejects references to other texts ("Article 5 of
    # Regulation ...").
    exclusion = parts["exclusion"]

    # The number must be consumed completely before the exclusion is tested.
    # Without the guards below the regex engine backtracks to a shorter number
    # ("Article 12 of ..." -> "Article 1", "Annex II of ..." -> "Annex I") or
    # drops the paragraph ("Article 5(2) of ..." -> "Article 5") to get past it.
    patterns = {
        "article": (
            "art_num",
            rf"\b{article}\s+(?P<art_num>\d+)(?!\d)"
            rf"(?:[(](?P<parag_num>\d+)[)]|(?![(]\d+[)]))"
            rf"(?:{exclusion})",
        ),
        # The trailing \b also keeps ordinary words that merely start with a
        # numeral letter ("Annex lists ...") from matching under IGNORECASE.
        "annex": (
            "annex_num",
            rf"\b{annex}\s+(?P<annex_num>[DILMVX]+)\b(?:{exclusion})",
        ),
    }

    text = text_content(el)
    crossrefs = []
    for target, (id_group, pattern) in patterns.items():
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            # Only the article pattern defines the paragraph group.
            parag_num = match.groupdict().get("parag_num")
            crossrefs.append(
                CrossReference(
                    target=target,
                    id=match.group(id_group),
                    paragraph=int(parag_num) if parag_num else None,
                    text=match.group(0),
                )
            )
    return crossrefs
def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
    """Extract a specific article from a Formex document.

    Args:
        doc: The XML document to extract from.
        article_id: The article number; it is zero-padded to three digits to
            build the ``IDENTIFIER`` attribute value that is searched for.

    Returns:
        The first matching ``ARTICLE`` element, or None when there is no match.
    """
    # Evaluate the XPath query once and reuse the result.
    matches = doc.xpath(f".//ARTICLE[@IDENTIFIER='{article_id:03d}']")
    return matches[0] if matches else None
def extract_paragraph(
    doc: ET._Element, article_id: int, paragraph_id: int
) -> ET._Element | None:
    """Extract a specific paragraph from an article in a Formex document.

    Args:
        doc: The XML document to extract from.
        article_id: The article number.
        paragraph_id: The paragraph number. Both numbers are zero-padded to
            three digits and joined with a dot to build the ``IDENTIFIER``
            attribute value that is searched for.

    Returns:
        The first matching ``PARAG`` element, or None when there is no match.
    """
    # Evaluate the XPath query once and reuse the result.
    matches = doc.xpath(
        f".//PARAG[@IDENTIFIER='{article_id:03d}.{paragraph_id:03d}']"
    )
    return matches[0] if matches else None
class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5.

    Text and attribute values taken from the XML are HTML-escaped before they
    are placed in the output. Article and annex references found in running
    text are wrapped in ``<a class="cross-ref">`` links.
    """

    # Heading elements; their leading text is not scanned for cross-references.
    _TITLE_TAGS = frozenset({"TI", "STI", "TI.ART", "STI.ART"})

    # HT TYPE attribute -> (opening markup, closing markup).
    _HIGHLIGHT_MARKUP = {
        "BOLD": ("<strong>", "</strong>"),
        "ITALIC": ("<em>", "</em>"),
        "SUB": ("<sub>", "</sub>"),
        "SUP": ("<sup>", "</sup>"),
        "UNDERLINE": ("<u>", "</u>"),
        "SC": ('<span style="font-variant: small-caps">', "</span>"),
    }

    # LIST TYPE attribute -> suffix of the "list-..." CSS class.
    # DASH/NDASH use a plain word: the class attribute is written with single
    # quotes, so a suffix that itself contains quotes would break the markup.
    _LIST_STYLE_MAP = {
        "ARAB": "decimal",
        "ALPHA": "upper-alpha",
        "alpha": "lower-alpha",
        "ROMAN": "upper-roman",
        "roman": "lower-roman",
        "BULLET": "disc",
        "DASH": "dash",
        "NDASH": "dash",
        "NONE": "none",
        "OTHER": "none",
    }
    _ORDERED_LIST_TYPES = frozenset({"ARAB", "ALPHA", "alpha", "ROMAN", "roman"})

    # Block-level children accepted inside each container element.
    _PARAG_CHILDREN = frozenset({"ALINEA", "COMMENT", "QUOT.S", "NO.PARAG"})
    _SECTION_CHILDREN = frozenset({"ALINEA", "PARAG", "COMMENT", "QUOT.S", "SUBDIV"})

    def __init__(self, language: Language, namespace: Optional[str] = None):
        """
        Initialize the converter.

        Args:
            language: Language object to determine the language for cross-reference extraction
            namespace: Optional XML namespace to use when parsing elements
        """
        self.ns = namespace
        self.language = language
        self.ns_prefix = f"{{{namespace}}}" if namespace else ""

    def _get_tag(self, tag: str) -> str:
        """Get the tag name with namespace if available."""
        return f"{self.ns_prefix}{tag}"

    def _local_name(self, element: ET._Element) -> str:
        """Return the element's tag without the configured namespace prefix."""
        return element.tag.removeprefix(self.ns_prefix)

    def _first_child(self, parent: ET._Element, tag: str) -> Optional[ET._Element]:
        """Return the first direct child of ``parent`` named ``tag``, or None."""
        # iterchildren() accepts "{namespace}tag" names, which xpath() does not.
        return next(parent.iterchildren(self._get_tag(tag)), None)

    def _get_text(self, element: Optional[ET._Element]) -> str:
        """Get the unescaped text of an element, including all nested text.

        Returns an empty string when ``element`` is None. The element's own
        tail is not included.
        """
        if element is None:
            return ""
        return "".join(element.itertext())

    def _create_id(self, identifier: str) -> str:
        """Create a valid HTML ID from the article identifier."""
        # Every character other than ASCII letters, digits and "-" becomes "-".
        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
        return f"art-{clean_id}"

    def _xref_anchor(self, xref: CrossReference) -> str:
        """Build the link markup for a single cross-reference."""
        paragraph = xref.paragraph or ""
        return (
            f'<a href="" data-target="{html.escape(str(xref.target))}" '
            f'data-id="{html.escape(xref.id)}" '
            f'data-paragraph-id="{paragraph}" class="cross-ref">'
            f"{html.escape(xref.text, quote=False)}</a>"
        )

    def _replace_xref(self, text: str, xref: CrossReference) -> str:
        """Replace every occurrence of one cross-reference in ``text`` with a link.

        ``text`` is otherwise returned unchanged (it is not escaped). Applying
        this repeatedly to the same string can wrap an already linked reference
        again; ``_link_xrefs`` handles several references in one pass.
        """
        return text.replace(xref.text, self._xref_anchor(xref))

    def _link_xrefs(self, text: str, xrefs: list[CrossReference]) -> str:
        """HTML-escape ``text`` and link every cross-reference found in it.

        All references are substituted in a single pass over the original text,
        so a reference that occurs more than once, or whose text is contained in
        a longer reference, is never replaced inside markup that was just
        generated.

        Args:
            text: Raw (unescaped) text taken from the XML.
            xrefs: Cross-references to look for; entries with the same text
                are treated as one.

        Returns:
            The escaped text with the references wrapped in links.
        """
        by_text: dict[str, CrossReference] = {}
        for xref in xrefs:
            if xref.text:
                by_text.setdefault(xref.text, xref)
        if not by_text:
            return html.escape(text, quote=False)

        # Longest first, so that "Article 12" wins over "Article 1".
        ordered = sorted(by_text, key=len, reverse=True)
        finder = re.compile("|".join(re.escape(candidate) for candidate in ordered))

        pieces = []
        cursor = 0
        for match in finder.finditer(text):
            pieces.append(html.escape(text[cursor : match.start()], quote=False))
            pieces.append(self._xref_anchor(by_text[match.group(0)]))
            cursor = match.end()
        pieces.append(html.escape(text[cursor:], quote=False))
        return "".join(pieces)

    def _convert_btx(self, element: Optional[ET._Element]) -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.

        The element's leading text is emitted first, then every child element
        followed by its tail. Cross-references are linked in the leading text
        of non-title elements that have no children, and in every child tail.

        This is a simplified implementation. In a complete version,
        this would handle all the possible child elements defined in t_btx.
        """
        if element is None:
            return ""

        leading = element.text or ""
        is_title = self._local_name(element) in self._TITLE_TAGS
        if not is_title and len(element) == 0:
            # Cross-references should be treated at the deepest level
            parts = [self._link_xrefs(leading, extract_xrefs(element, self.language))]
        else:
            parts = [html.escape(leading, quote=False)]

        for child in element.iterchildren(tag="*"):
            parts.append(self._convert_inline(child))
            if child.tail:
                # extract_xrefs() scans the child's text including its tail;
                # only the references present in the tail are linked here.
                xrefs = extract_xrefs(child, self.language)
                parts.append(self._link_xrefs(child.tail, xrefs))
        return "".join(parts)

    def _convert_inline(self, child: ET._Element) -> str:
        """Convert one child element of a text container to HTML (without its tail)."""
        child_tag = self._local_name(child)

        if child_tag == "HT":
            # Highlighted text; unknown types are rendered without a wrapper.
            opening, closing = self._HIGHLIGHT_MARKUP.get(
                child.get("TYPE", "NORMAL"), ("", "")
            )
            return f"{opening}{self._convert_btx(child)}{closing}"
        if child_tag == "FT":
            # Format text (numbers, codes, etc.)
            ft_type = child.get("TYPE", "")
            inner = self._convert_btx(child)
            if ft_type in ("NUMBER", "DECIMAL"):
                return f'<span class="ft-number">{inner}</span>'
            if ft_type == "CODE":
                return f"<code>{inner}</code>"
            return f'<span class="ft-{html.escape(ft_type.lower())}">{inner}</span>'
        if child_tag == "IE":
            # Inclusion/exclusion marker
            return '<span class="ie-marker">±</span>'
        if child_tag == "BR":
            return "<br>"
        if child_tag == "P":
            return f"<p>{self._convert_btx(child)}</p>"
        if child_tag == "NOTE":
            # Note reference
            note_id = html.escape(child.get("NOTE.ID", ""))
            return f'<sup class="note-ref" id="{note_id}">{self._convert_btx(child)}</sup>'
        if child_tag == "QUOT.START":
            return "&ldquo;"
        if child_tag == "QUOT.END":
            return "&rdquo;"
        if child_tag == "LIST":
            return self._convert_list_element(child)
        if child_tag == "TXT":
            # Simple text element
            return html.escape(self._get_text(child))
        if child_tag == "LINK":
            uri = html.escape(child.get("URI", "#"))
            return f'<a href="{uri}">{self._convert_btx(child)}</a>'
        if child_tag == "REF.DOC.OJ":
            # Reference to an OJ document, rendered from its attributes.
            coll = html.escape(child.get("COLL", ""), quote=False)
            no_oj = html.escape(child.get("NO.OJ", ""), quote=False)
            date = html.escape(child.get("DATE.PUB", ""), quote=False)
            page = html.escape(child.get("PAGE.FIRST", ""), quote=False)
            return f'<span class="ref-oj">{coll} {no_oj}, {date}, p. {page}</span>'
        # Any other element contributes only its converted content.
        return self._convert_btx(child)

    def _convert_list_element(self, list_element: ET._Element) -> str:
        """Wrap the converted items of a LIST element in ``<ul>`` or ``<ol>``."""
        list_type = list_element.get("TYPE", "BULLET")
        items = self._convert_list(list_element)
        if list_type == "BULLET":
            return f"<ul>{items}</ul>"
        # Unknown list types are rendered like TYPE="NONE".
        list_style_type = self._LIST_STYLE_MAP.get(list_type, "none")
        wrapper = "ol" if list_type in self._ORDERED_LIST_TYPES else "ul"
        return f"<{wrapper} class='list-{list_style_type}'>{items}</{wrapper}>"

    def _convert_list(self, list_element: ET._Element) -> str:
        """Convert the ITEM children of a Formex LIST element to ``<li>`` items."""
        rendered = []
        for item in list_element.iterchildren(self._get_tag("ITEM")):
            item_parts = []
            # tag="*" skips comments, which have no string tag to compare.
            for child in item.iterchildren(tag="*"):
                if self._local_name(child) != "NP":
                    item_parts.append(self._convert_btx(child))
                    continue
                # Numbered paragraph - extract the number and text
                no_p = self._first_child(child, "NO.P")
                txt = self._first_child(child, "TXT")
                if no_p is None or txt is None:
                    # Nothing is emitted for an NP without both a number and a text.
                    continue
                num = html.escape(self._get_text(no_p), quote=False)
                text = self._link_xrefs(
                    self._get_text(txt), extract_xrefs(txt, self.language)
                )
                item_parts.append(f'<span class="item-number">{num}</span> {text}')
            rendered.append(f"<li>{''.join(item_parts)}</li>")
        return "".join(rendered)

    def _convert_alinea(self, alinea: ET._Element) -> str:
        """Convert an ALINEA element to HTML."""
        return f'<p class="alinea">{self._convert_btx(alinea)}</p>'

    def _convert_block(self, child_tag: str, child: ET._Element, context: str) -> str:
        """Convert one block-level child (ALINEA, PARAG, SUBDIV, ...) to HTML."""
        if child_tag == "ALINEA":
            return self._convert_alinea(child)
        if child_tag == "PARAG":
            return self._convert_parag(child)
        if child_tag == "SUBDIV":
            return self._convert_subdiv(child)
        if child_tag == "COMMENT":
            return f'<div class="comment">{self._convert_btx(child)}</div>'
        if child_tag == "QUOT.S":
            return f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
        if child_tag == "NO.PARAG":
            return f'<span class="paragraph-number">{self._convert_btx(child)}</span>'
        raise RuntimeError(
            f"Unexpected child element '{child_tag}' in {context}: {text_content(child)}"
        )

    def _convert_children(
        self,
        parent: ET._Element,
        context: str,
        allowed: frozenset,
        skip: tuple = (),
    ) -> str:
        """Convert the block-level children of ``parent`` in tree order.

        Args:
            parent: The container element.
            context: Name of the container, used in the error message.
            allowed: Local tag names that may appear in this container.
            skip: Local tag names that the caller has already handled.

        Raises:
            RuntimeError: If a child is neither skipped nor allowed.
        """
        converted = []
        for child in parent.iterchildren(tag="*"):
            child_tag = self._local_name(child)
            if child_tag in skip:
                continue
            if child_tag not in allowed:
                raise RuntimeError(
                    f"Unexpected child element '{child_tag}' in {context}: {text_content(child)}"
                )
            converted.append(self._convert_block(child_tag, child, context))
        return "".join(converted)

    def _convert_parag(self, parag: ET._Element) -> str:
        """Convert a PARAG (paragraph) element to HTML.

        Raises:
            RuntimeError: If the paragraph contains an unexpected child element.
        """
        identifier = parag.get("IDENTIFIER", "")
        parag_id = self._create_id(identifier) if identifier else ""
        content = self._convert_children(parag, "PARAG", self._PARAG_CHILDREN)
        return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>'

    def _convert_subdiv(self, subdiv: ET._Element) -> str:
        """Convert a SUBDIV (subdivision) element to HTML, preserving child order.

        Raises:
            RuntimeError: If the subdivision contains an unexpected child element.
        """
        title = ""
        title_elem = self._first_child(subdiv, "TITLE")
        if title_elem is not None:
            # Process TI (title) and STI (subtitle) elements
            ti_elem = self._first_child(title_elem, "TI")
            ti_text = self._convert_btx(ti_elem) if ti_elem is not None else ""
            title = f'<h4 class="subdivision-title">{ti_text}</h4>'
            subtitles = [
                self._convert_btx(sti)
                for sti in title_elem.iterchildren(self._get_tag("STI"))
            ]
            if subtitles:
                joined = " ".join(subtitles)
                title += f'<h5 class="subdivision-subtitle">{joined}</h5>'

        content = self._convert_children(
            subdiv, "SUBDIV", self._SECTION_CHILDREN, skip=("TITLE",)
        )
        return f'<section class="subdivision">{title}{content}</section>'

    def convert_article(self, article: Union[str, ET._Element]) -> str:
        """
        Convert a Formex <ARTICLE> element to HTML5.

        Processing instructions are stripped from the element in place, so an
        element passed in by the caller is modified.

        Args:
            article: Either an lxml Element or an XML string representing an ARTICLE

        Returns:
            A string containing the HTML5 representation of the article, or a
            ``<p>`` with the parser message when an XML string cannot be parsed.

        Raises:
            RuntimeError: If the article contains an unexpected child element.
        """
        if isinstance(article, str):
            try:
                parser = ET.XMLParser(remove_blank_text=True)
                article = cast(
                    ET._Element, ET.fromstring(article.encode("utf-8"), parser)
                )
            except ET.XMLSyntaxError as e:
                # The message can quote fragments of the input, so escape it.
                return f"<p>Error parsing XML: {html.escape(str(e), quote=False)}</p>"

        article_id = self._create_id(article.get("IDENTIFIER", ""))

        # Strip processing instructions; their tail text is kept in the tree.
        ET.strip_tags(article, ET.PI)

        # First TI.ART / STI.ART anywhere below the article.
        ti_art = next(article.iter(self._get_tag("TI.ART")), None)
        article_title = self._convert_btx(ti_art) if ti_art is not None else ""
        sti_art = next(article.iter(self._get_tag("STI.ART")), None)
        article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""

        # Emit a header as soon as there is a title or a subtitle; each heading
        # is written only when it has content.
        header = ""
        if article_title or article_subtitle:
            header = "<header>"
            if article_title:
                header += f'<h3 class="article-title">{article_title}</h3>'
            if article_subtitle:
                header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
            header += "</header>"

        content = self._convert_children(
            article, "ARTICLE", self._SECTION_CHILDREN, skip=("TI.ART", "STI.ART")
        )
        return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'