Article cross-reference linking
This commit is contained in:
		| @@ -1,10 +1,14 @@ | ||||
| import html | ||||
| import re | ||||
| import warnings | ||||
| from dataclasses import dataclass | ||||
| from typing import Optional, Union | ||||
|  | ||||
| import lxml.etree | ||||
| from lxml import etree as ET | ||||
|  | ||||
| from formex_viewer.main import Language | ||||
|  | ||||
|  | ||||
| def text_content(el: lxml.etree.Element) -> str: | ||||
|     """Get the text content of an XML element, including all child elements.""" | ||||
| @@ -23,17 +27,79 @@ def text_content(el: lxml.etree.Element) -> str: | ||||
|     return "".join(_iterate(el)) | ||||
|  | ||||
|  | ||||
@dataclass
class CrossReference:
    """A single reference to an article or annex found in running text."""

    # Captured number of the referenced unit: the digits of an article
    # reference or the Roman numeral of an annex reference.
    id: str
    # The complete matched text, e.g. the keyword together with its number.
    text: str
    # Kind of unit referenced; extract_xrefs uses "article" or "annex".
    target: str
|  | ||||
|  | ||||
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
    """Extract article and annex cross-references from an XML element.

    The element's text content is scanned with language-specific regular
    expressions for an article keyword followed by a number, and an annex
    keyword followed by a Roman numeral.

    Args:
        el: The XML element to extract cross-references from.
        language: Language of the text; selects the keyword patterns. Only
            ``Language.ENG`` and ``Language.DEU`` are supported.

    Returns:
        A list of ``CrossReference`` objects: all article references first,
        then all annex references, each group in order of appearance. An
        empty list (after emitting a warning) if ``language`` is unsupported.
    """
    pattern_parts = {
        Language.ENG: {
            "article": r"(Art\.|Articles?)",
            "annex": r"(Ann\.|Annex)",
            "exclusion": r"(?! of(?! this))",
        },
        Language.DEU: {
            "article": r"(Art\.|Artikels?)",
            "annex": r"(Anhang)",
            "exclusion": r"(?! von)",
        },
    }

    # Bail out before doing any text extraction work for unsupported languages.
    if language not in pattern_parts:
        warnings.warn(
            f"Language '{language}' not supported for cross-reference extraction. Returning empty list."
        )
        return []

    parts = pattern_parts[language]
    # Pull the fragments into locals so the f-strings below do not reuse the
    # enclosing quote character inside a replacement field (only valid on
    # Python 3.12+).
    article_kw = parts["article"]
    annex_kw = parts["annex"]
    exclusion = parts["exclusion"]

    # Prevent zealous matching of references to other texts by using a negative
    # lookahead. Also, match only at word boundaries to prevent partial matches.
    # Maps target kind -> (name of the group holding the id, pattern).
    # NOTE: patterns are applied with re.IGNORECASE, so the numeral class of
    # the annex pattern also accepts lowercase letters.
    patterns = {
        "article": ("art_num", rf"\b{article_kw}\s+(?P<art_num>\d+){exclusion}\b"),
        "annex": ("annex_num", rf"\b{annex_kw}\s+(?P<annex_num>[DILMVX]+){exclusion}\b"),
    }

    text = text_content(el)
    crossrefs: list[CrossReference] = []
    for target, (id_group, pattern) in patterns.items():
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            crossrefs.append(
                CrossReference(
                    id=match.group(id_group),
                    text=match.group(0),
                    target=target,
                )
            )
    return crossrefs
|  | ||||
|  | ||||
| class FormexArticleConverter: | ||||
|     """Converts Formex XML <ARTICLE> elements to semantic HTML5.""" | ||||
|  | ||||
|     def __init__(self, namespace: Optional[str] = None): | ||||
|     def __init__(self, language: Language, namespace: Optional[str] = None): | ||||
|         """ | ||||
|         Initialize the converter. | ||||
|  | ||||
|         Args: | ||||
|             language: Language object to determine the language for cross-reference extraction | ||||
|             namespace: Optional XML namespace to use when parsing elements | ||||
|         """ | ||||
|         self.ns = namespace | ||||
|         self.language = language | ||||
|         self.ns_prefix = f"{{{namespace}}}" if namespace else "" | ||||
|  | ||||
|     def _get_tag(self, tag: str) -> str: | ||||
| @@ -67,6 +133,15 @@ class FormexArticleConverter: | ||||
|         clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier) | ||||
|         return f"art-{clean_id}" | ||||
|  | ||||
|     def _replace_xref(self, text: str, xref: CrossReference) -> str: | ||||
|         """Replace a cross-reference instance with semantic markup in the text.""" | ||||
|         # Replace the cross-reference text with a link | ||||
|         text = text.replace( | ||||
|             xref.text, | ||||
|             f'<a href="" data-target="{xref.target}" data-id="{xref.id}" class="cross-ref">{xref.text}</a>', | ||||
|         ) | ||||
|         return text | ||||
|  | ||||
|     def _convert_btx(self, element: ET.Element) -> str: | ||||
|         """ | ||||
|         Convert basic text elements (t_btx, t_btx.seq) to HTML. | ||||
| @@ -79,6 +154,16 @@ class FormexArticleConverter: | ||||
|  | ||||
|         result = element.text or "" | ||||
|  | ||||
|         is_title = element.tag in ("TI", "STI", "TI.ART", "STI.ART") | ||||
|         if not is_title and not element.getchildren(): | ||||
|             # Cross-references should be treated at the deepest level | ||||
|             xrefs = extract_xrefs(element, self.language) | ||||
|             print("Extracted cross-references: ", xrefs) | ||||
|  | ||||
|             for xref in xrefs: | ||||
|                 # Replace the cross-reference text with a link | ||||
|                 result = self._replace_xref(result, xref) | ||||
|  | ||||
|         for child in element: | ||||
|             child_tag = child.tag.replace(self.ns_prefix, "") | ||||
|  | ||||
| @@ -176,7 +261,13 @@ class FormexArticleConverter: | ||||
|                 result += self._convert_btx(child) | ||||
|  | ||||
|             if child.tail: | ||||
|                 result += child.tail | ||||
|                 xrefs = extract_xrefs(child, self.language) | ||||
|                 tail_text = child.tail | ||||
|                 for xref in xrefs: | ||||
|                     # Replace the cross-reference text with a link | ||||
|                     tail_text = self._replace_xref(tail_text, xref) | ||||
|  | ||||
|                 result += tail_text | ||||
|  | ||||
|         return result | ||||
|  | ||||
| @@ -200,6 +291,12 @@ class FormexArticleConverter: | ||||
|                     if no_p is not None and txt is not None: | ||||
|                         num = self._get_text(no_p) | ||||
|                         text = self._get_text(txt) | ||||
|  | ||||
|                         # Handle cross-references within the text | ||||
|                         xrefs = extract_xrefs(txt, self.language) | ||||
|                         for xref in xrefs: | ||||
|                             text = self._replace_xref(text, xref) | ||||
|  | ||||
|                         item_content += f'<span class="item-number">{num}</span> {text}' | ||||
|                 elif child_tag == "P": | ||||
|                     # Regular paragraph | ||||
|   | ||||
| @@ -133,7 +133,7 @@ def article(celex_id: str, article_id: int, language: Language = Language.ENG): | ||||
|         num = article.get("IDENTIFIER").lstrip("0") | ||||
|         if num == str(article_id): | ||||
|             return Response( | ||||
|                 FormexArticleConverter().convert_article(article), | ||||
|                 FormexArticleConverter(language=language).convert_article(article), | ||||
|                 media_type="text/html", | ||||
|             ) | ||||
|  | ||||
|   | ||||
		Reference in New Issue
	
	Block a user