512 lines
20 KiB
Python
512 lines
20 KiB
Python
import html
|
||
import re
|
||
import warnings
|
||
from dataclasses import dataclass
|
||
from typing import Literal, Optional, Union, cast
|
||
|
||
import lxml.etree
|
||
from lxml import etree as ET
|
||
|
||
from formex_viewer.main import Language
|
||
|
||
|
||
def text_content(el: "ET._Element") -> str:
    """Return the concatenated text of an XML element and all descendants.

    Walks ``el.iter()`` (which yields ``el`` itself followed by every
    descendant in document order) and collects each node's ``text`` and
    ``tail``.  The element's own ``tail`` is deliberately included because
    callers (cross-reference extraction) also scan the text that follows
    an element.

    BUG FIX: the previous version additionally yielded ``el.text`` and
    ``el.tail`` after the loop, even though ``el.iter()`` had already
    yielded them — so the element's own text appeared twice in the result
    (e.g. ``<p>hello</p>`` produced ``"hellohello"``).

    Args:
        el: The XML element to read (lxml or stdlib ElementTree element).

    Returns:
        The concatenated text content as a single string.
    """

    def _iterate(node):
        # node.iter() includes `node` itself, so its text/tail are covered
        # exactly once here — no extra yields outside the loop.
        for descendant in node.iter():
            if descendant.text:
                yield descendant.text
            if descendant.tail:
                yield descendant.tail

    return "".join(_iterate(el))
|
||
|
||
|
||
@dataclass
class CrossReference:
    """A single cross-reference found in a legal text.

    Produced by :func:`extract_xrefs`; consumed by the HTML converter to
    render the reference as a link with data attributes.
    """

    # Kind of target the reference points at.
    target: Literal["article", "annex"]
    # The exact matched substring (e.g. "Article 5(2)"), used for replacement.
    text: str
    # Article number or annex roman numeral, as matched in the text.
    # NOTE: shadows the `id` builtin, but renaming would break callers.
    id: str
    # Paragraph number for references like "Article 5(2)"; None when absent.
    paragraph: int | None = None
|
||
|
||
|
||
def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
    """Extract article/annex cross-references from an XML element.

    Args:
        el: The XML element to extract cross-references from.
        language: The language whose reference patterns should be applied.

    Returns:
        A list of CrossReference objects, one per pattern match found in
        the element's text content. Empty for unsupported languages
        (a warning is emitted in that case).
    """
    crossrefs: list[CrossReference] = []
    text = text_content(el)

    PATTERN_PARTS = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }

    if language not in PATTERN_PARTS:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
        )
        return []

    # Prevent zealous matching of references to other texts by using a negative lookahead
    # Also, match only at word boundaries to prevent partial matches
    parts = PATTERN_PARTS[language]
    patterns = {
        "article": rf"\b{parts['article']}\s+(?P<art_num>\d+)(?:[(](?P<parag_num>\d+)[)])?(?:{parts['exclusion']})",
        "annex": rf"\b{parts['annex']}\s+(?P<annex_num>[DILMVX]+)(?:{parts['exclusion']})",
    }
    for key, pattern in patterns.items():
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            crossref_id = (
                match.group("art_num") if key == "article" else match.group("annex_num")
            )
            # The paragraph group only exists for "Article N(M)" style
            # references; for annexes and plain "Article N" it is None.
            # BUG FIX: the previous guard raised RuntimeError whenever the
            # paragraph number was absent — i.e. for every annex reference
            # and every article reference without a "(M)" suffix — and
            # int(None) would have crashed anyway. A missing paragraph is
            # the normal case, not an error.
            parag_num = match.groupdict().get("parag_num")
            crossrefs.append(
                CrossReference(
                    target=key,  # dict keys are exactly "article"/"annex"
                    id=crossref_id,
                    paragraph=int(parag_num) if parag_num else None,
                    text=match.group(0),
                )
            )
    return crossrefs
|
||
|
||
|
||
def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
    """Extract a specific article from a Formex document.

    Args:
        doc: The XML document to extract from.
        article_id: The article number.

    Returns:
        The extracted article element, or None if no article with that
        identifier exists.
    """
    # IDENTIFIER attributes are zero-padded to three digits (e.g. "007").
    xpath = f".//ARTICLE[@IDENTIFIER='{article_id:03d}']"
    # FIX: evaluate the XPath once instead of twice per call.
    matches = doc.xpath(xpath)
    return matches[0] if matches else None
|
||
|
||
|
||
def extract_paragraph(
    doc: ET._Element, article_id: int, paragraph_id: int
) -> ET._Element | None:
    """Extract a specific paragraph from an article in a Formex document.

    Args:
        doc: The XML document to extract from.
        article_id: The article number.
        paragraph_id: The paragraph number.

    Returns:
        The extracted paragraph element, or None if no paragraph with that
        identifier exists.
    """
    # PARAG identifiers combine the zero-padded article and paragraph
    # numbers, e.g. "007.002".
    xpath = f".//PARAG[@IDENTIFIER='{article_id:03d}.{paragraph_id:03d}']"
    # FIX: evaluate the XPath once instead of twice per call.
    matches = doc.xpath(xpath)
    return matches[0] if matches else None
|
||
|
||
|
||
class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5.

    The converter walks the Formex element tree and emits an HTML string,
    decorating recognised cross-references (articles/annexes) with <a>
    tags carrying data attributes so a viewer can resolve them later.
    """

    def __init__(self, language: "Language", namespace: Optional[str] = None):
        """
        Initialize the converter.

        Args:
            language: Language object to determine the language for cross-reference extraction
            namespace: Optional XML namespace to use when parsing elements
        """
        self.ns = namespace
        self.language = language
        # lxml spells namespaced tags as "{uri}TAG"; precompute the prefix.
        self.ns_prefix = f"{{{namespace}}}" if namespace else ""

    def _get_tag(self, tag: str) -> str:
        """Get the tag name with namespace prefix if one was configured."""
        return f"{self.ns_prefix}{tag}"

    def _get_text(self, element: "ET._Element") -> str:
        """Get the text content of an element, including all nested text.

        Uses lxml's C-level ``text_content()`` when available, falling back
        to manual traversal for plain elements.
        """
        if element is None:
            return ""

        try:
            # lxml elements provide this efficient built-in helper.
            return element.text_content()
        except AttributeError:
            # Manual fallback: own text, then each child subtree and tail.
            text = element.text or ""
            for child in element.iterchildren(tag="*"):
                text += self._get_text(child)
                if child.tail:
                    text += child.tail
            return text

    def _create_id(self, identifier: str) -> str:
        """Create a valid HTML ID from the article identifier."""
        # Replace anything outside [a-zA-Z0-9-] so the id is CSS/URL safe.
        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
        return f"art-{clean_id}"

    def _replace_xref(self, text: str, xref: "CrossReference") -> str:
        """Replace a cross-reference occurrence with semantic link markup."""
        # Empty data-paragraph-id when the reference has no paragraph part.
        paragraph_id = xref.paragraph or ""
        link = (
            f'<a href="" data-target="{xref.target}" data-id="{xref.id}" '
            f'data-paragraph-id="{paragraph_id}" class="cross-ref">{xref.text}</a>'
        )
        return text.replace(xref.text, link)

    def _convert_btx(self, element: "ET._Element") -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.

        This is a simplified implementation. In a complete version,
        this would handle all the possible child elements defined in t_btx.
        """
        if element is None:
            return ""

        result = element.text or ""

        is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART")
        # Cross-references are handled at the deepest (leaf) level only,
        # and never inside title elements.
        # FIX: getchildren() is deprecated in lxml (and removed from
        # ElementTree); len(element) counts children the supported way.
        if not is_title and len(element) == 0:
            for xref in extract_xrefs(element, self.language):
                result = self._replace_xref(result, xref)

        for child in element.iterchildren(tag="*"):
            child_tag = child.tag.replace(self.ns_prefix, "")

            # Process common inline elements
            if child_tag == "HT":
                # Highlighted text: map the TYPE attribute to an HTML tag.
                ht_type = child.get("TYPE", "NORMAL")
                if ht_type == "BOLD":
                    result += f"<strong>{self._convert_btx(child)}</strong>"
                elif ht_type == "ITALIC":
                    result += f"<em>{self._convert_btx(child)}</em>"
                elif ht_type == "SUB":
                    result += f"<sub>{self._convert_btx(child)}</sub>"
                elif ht_type == "SUP":
                    result += f"<sup>{self._convert_btx(child)}</sup>"
                elif ht_type == "UNDERLINE":
                    result += f"<u>{self._convert_btx(child)}</u>"
                elif ht_type == "SC":  # Small caps
                    result += f'<span style="font-variant: small-caps">{self._convert_btx(child)}</span>'
                else:
                    result += self._convert_btx(child)
            elif child_tag == "FT":
                # Format text (numbers, codes, etc.)
                ft_type = child.get("TYPE", "")
                if ft_type == "NUMBER" or ft_type == "DECIMAL":
                    result += (
                        f'<span class="ft-number">{self._convert_btx(child)}</span>'
                    )
                elif ft_type == "CODE":
                    result += f"<code>{self._convert_btx(child)}</code>"
                else:
                    result += f'<span class="ft-{ft_type.lower()}">{self._convert_btx(child)}</span>'
            elif child_tag == "IE":
                # Inclusion/exclusion marker
                result += '<span class="ie-marker">±</span>'
            elif child_tag == "BR":
                # Line break
                result += "<br>"
            elif child_tag == "P":
                # Paragraph
                result += f"<p>{self._convert_btx(child)}</p>"
            elif child_tag == "NOTE":
                # Note reference
                note_id = child.get("NOTE.ID", "")
                result += f'<sup class="note-ref" id="{note_id}">{self._convert_btx(child)}</sup>'
            elif child_tag == "QUOT.START":
                # Opening quotation mark
                result += "“"
            elif child_tag == "QUOT.END":
                # Closing quotation mark
                result += "”"
            elif child_tag == "LIST":
                # Formex styles to CSS list-style-type mapping
                list_style_map = {
                    "ARAB": "decimal",
                    "ALPHA": "upper-alpha",
                    "alpha": "lower-alpha",
                    "ROMAN": "upper-roman",
                    "roman": "lower-roman",
                    "BULLET": "disc",
                    "DASH": "'—'",
                    # FIX: this key had a stray trailing colon ("NDASH:"),
                    # so every NDASH list raised KeyError.
                    "NDASH": "'–'",
                    "NONE": "none",
                    "OTHER": "none",
                }

                list_type = child.get("TYPE", "BULLET")
                # FIX: use .get() so an unrecognised TYPE degrades to an
                # unstyled list instead of raising KeyError.
                list_style_type = list_style_map.get(list_type, "none")
                if list_type == "BULLET":
                    result += f"<ul>{self._convert_list(child)}</ul>"
                elif list_type in ["ARAB", "ALPHA", "alpha", "ROMAN", "roman"]:
                    result += f"<ol class='list-{list_style_type}'>{self._convert_list(child)}</ol>"
                else:
                    result += f"<ul class='list-{list_style_type}'>{self._convert_list(child)}</ul>"
            elif child_tag == "TXT":
                # Simple text element — escaped, never treated as markup.
                result += html.escape(self._get_text(child))
            elif child_tag == "LINK":
                # Hyperlink with a URI attribute
                uri = child.get("URI", "#")
                result += f'<a href="{uri}">{self._convert_btx(child)}</a>'
            elif child_tag == "REF.DOC.OJ":
                # Reference to an Official Journal document
                coll = child.get("COLL", "")
                no_oj = child.get("NO.OJ", "")
                date = child.get("DATE.PUB", "")
                page = child.get("PAGE.FIRST", "")
                result += (
                    f'<span class="ref-oj">{coll} {no_oj}, {date}, p. {page}</span>'
                )
            else:
                # Recursively process other element types
                result += self._convert_btx(child)

            if child.tail:
                # NOTE(review): xrefs are extracted from the child's content
                # but substituted into its tail text — presumably because
                # text_content() also covers the tail; confirm intent.
                tail_text = child.tail
                for xref in extract_xrefs(child, self.language):
                    tail_text = self._replace_xref(tail_text, xref)
                result += tail_text

        return result

    def _convert_list(self, list_element: "ET._Element") -> str:
        """Convert a Formex LIST element to a run of HTML <li> items."""
        result = ""
        # Using lxml's xpath to get direct child ITEM elements
        for item in list_element.xpath(f"./{self._get_tag('ITEM')}"):
            item_content = ""
            # ITEM contents should be either NP or P elements.
            for child in item:
                child_tag = child.tag.replace(self.ns_prefix, "")
                if child_tag == "NP":
                    # Numbered paragraph - extract the number and text
                    no_p_elems = child.xpath(f"./{self._get_tag('NO.P')}")
                    txt_elems = child.xpath(f"./{self._get_tag('TXT')}")

                    no_p = no_p_elems[0] if no_p_elems else None
                    txt = txt_elems[0] if txt_elems else None

                    if no_p is not None and txt is not None:
                        num = self._get_text(no_p)
                        text = self._get_text(txt)

                        # Handle cross-references within the text
                        for xref in extract_xrefs(txt, self.language):
                            text = self._replace_xref(text, xref)

                        item_content += f'<span class="item-number">{num}</span> {text}'
                else:
                    # P and all other element types share the generic
                    # conversion (the two original branches were identical).
                    item_content += self._convert_btx(child)

            result += f"<li>{item_content}</li>"

        return result

    def _convert_alinea(self, alinea: "ET._Element") -> str:
        """Convert an ALINEA (unnumbered paragraph) element to HTML."""
        return f'<p class="alinea">{self._convert_btx(alinea)}</p>'

    def _convert_parag(self, parag: "ET._Element") -> str:
        """Convert a PARAG (numbered paragraph) element to HTML."""
        identifier = parag.get("IDENTIFIER", "")
        parag_id = self._create_id(identifier) if identifier else ""

        # Get the paragraph number using XPath
        no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
        parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""

        # Process the alineas within the paragraph
        content = ""
        for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"):
            content += self._convert_alinea(alinea)

        # Process any comments
        for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"):
            content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Process any quotations
        for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"):
            content += (
                f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
            )

        return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'

    def _convert_subdiv(self, subdiv: "ET._Element") -> str:
        """Convert a SUBDIV (subdivision) element to HTML, recursively."""
        # Get the title using XPath
        title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
        title = ""
        if title_elems:
            title_elem = title_elems[0]
            # Process TI (title) and STI (subtitle) elements
            ti_elems = title_elem.xpath(f"./{self._get_tag('TI')}")
            ti_text = self._convert_btx(ti_elems[0]) if ti_elems else ""

            sti_list = []
            for sti in title_elem.xpath(f"./{self._get_tag('STI')}"):
                sti_list.append(self._convert_btx(sti))

            title = f'<h4 class="subdivision-title">{ti_text}</h4>'
            if sti_list:
                title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'

        # Process content: either paragraphs, alineas, or nested subdivisions
        content = ""

        # Process paragraphs directly under this subdivision
        for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"):
            content += self._convert_parag(parag)

        # Process alineas directly under this subdivision
        for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"):
            content += self._convert_alinea(alinea)

        # Process comments directly under this subdivision
        for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"):
            content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Process quotations directly under this subdivision
        for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"):
            content += (
                f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
            )

        # Process nested subdivisions directly under this subdivision
        for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
            content += self._convert_subdiv(sub)

        return f'<section class="subdivision">{title}{content}</section>'

    def convert_article(self, article: Union[str, "ET._Element"]) -> str:
        """
        Convert a Formex <ARTICLE> element to HTML5.

        Args:
            article: Either an lxml Element or an XML string representing an ARTICLE

        Returns:
            A string containing the HTML5 representation of the article
        """
        # Parse the article if it's a string
        if isinstance(article, str):
            try:
                parser = ET.XMLParser(remove_blank_text=True)
                article = cast(
                    ET._Element, ET.fromstring(article.encode("utf-8"), parser)
                )
            except ET.XMLSyntaxError as e:
                return f"<p>Error parsing XML: {e}</p>"

        # Extract the article identifier
        identifier = article.get("IDENTIFIER", "")
        article_id = self._create_id(identifier)

        # Strip processing instructions
        ET.strip_tags(article, lxml.etree.PI)

        # Extract the article title
        # Use lxml's xpath capabilities for better namespace handling
        ti_art = article.xpath(f".//{self._get_tag('TI.ART')}")
        ti_art = ti_art[0] if ti_art else None
        article_title = self._convert_btx(ti_art) if ti_art is not None else ""

        # Extract the article subtitle if present
        sti_art = article.xpath(f".//{self._get_tag('STI.ART')}")
        sti_art = sti_art[0] if sti_art else None
        article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""

        # Build the header section.
        # FIX: the header was previously emitted only when BOTH a title and
        # a subtitle were present, silently dropping the header of
        # title-only articles; the inner check already handles a missing
        # subtitle.
        if article_title:
            header = f'<header><h3 class="article-title">{article_title}</h3>'
            if article_subtitle:
                header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
            header += "</header>"
        else:
            header = ""

        # Process the content based on what's present
        content = ""

        # Alineas directly under the article
        for alinea in article.xpath(f"./{self._get_tag('ALINEA')}"):
            content += self._convert_alinea(alinea)

        # Paragraphs directly under the article
        for parag in article.xpath(f"./{self._get_tag('PARAG')}"):
            content += self._convert_parag(parag)

        # Comments directly under the article
        for comment in article.xpath(f"./{self._get_tag('COMMENT')}"):
            content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Quotations directly under the article
        for quot in article.xpath(f"./{self._get_tag('QUOT.S')}"):
            content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'

        # Subdivisions directly under the article
        for subdiv in article.xpath(f"./{self._get_tag('SUBDIV')}"):
            content += self._convert_subdiv(subdiv)

        # Assemble the complete article
        return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'
|