Working initial version

src/formex_viewer/__init__.py (new file, 2 lines)
@@ -0,0 +1,2 @@
def main() -> None:
    print("Hello from formex-viewer!")

src/formex_viewer/formex4.py (new file, 367 lines)
@@ -0,0 +1,367 @@
import html
import re
from typing import Optional, Union

import lxml.etree
from lxml import etree as ET


def text_content(el: lxml.etree._Element) -> str:
    """Get the text content of an XML element, including all child elements."""

    def _iterate(el):
        # el.iter() yields the element itself first, so its own text and tail
        # are already covered here; yielding them again after the loop would
        # duplicate the root text.
        for child in el.iter():
            if child.text:
                yield child.text
            if child.tail:
                yield child.tail

    return "".join(_iterate(el))


class FormexArticleConverter:
    """Converts Formex XML <ARTICLE> elements to semantic HTML5."""

    def __init__(self, namespace: Optional[str] = None):
        """
        Initialize the converter.

        Args:
            namespace: Optional XML namespace to use when parsing elements
        """
        self.ns = namespace
        self.ns_prefix = f"{{{namespace}}}" if namespace else ""

    def _get_tag(self, tag: str) -> str:
        """Get the tag name with namespace if available."""
        return f"{self.ns_prefix}{tag}"

    def _get_text(self, element: ET.Element) -> str:
        """Get the text content of an element, including all nested text.

        This uses lxml's text_content() method when available, falling back to
        manual traversal for other cases.
        """
        if element is None:
            return ""

        # Use lxml's built-in text_content() method which is more efficient
        try:
            return element.text_content()
        except AttributeError:
            # Fall back to manual traversal if text_content() is not available
            text = element.text or ""
            for child in element:
                text += self._get_text(child)
                if child.tail:
                    text += child.tail
            return text

    def _create_id(self, identifier: str) -> str:
        """Create a valid HTML ID from the article identifier."""
        # Clean and normalize the identifier for use as an HTML id
        clean_id = re.sub(r"[^a-zA-Z0-9-]", "-", identifier)
        return f"art-{clean_id}"

    def _convert_btx(self, element: ET.Element) -> str:
        """
        Convert basic text elements (t_btx, t_btx.seq) to HTML.

        This is a simplified implementation. In a complete version,
        this would handle all the possible child elements defined in t_btx.
        """
        if element is None:
            return ""

        result = element.text or ""

        for child in element:
            child_tag = child.tag.replace(self.ns_prefix, "")

            # Process common inline elements
            if child_tag == "HT":
                # Handle highlighted text with appropriate HTML tags
                ht_type = child.get("TYPE", "NORMAL")
                if ht_type == "BOLD":
                    result += f"<strong>{self._convert_btx(child)}</strong>"
                elif ht_type == "ITALIC":
                    result += f"<em>{self._convert_btx(child)}</em>"
                elif ht_type == "SUB":
                    result += f"<sub>{self._convert_btx(child)}</sub>"
                elif ht_type == "SUP":
                    result += f"<sup>{self._convert_btx(child)}</sup>"
                elif ht_type == "UNDERLINE":
                    result += f"<u>{self._convert_btx(child)}</u>"
                elif ht_type == "SC":  # Small caps
                    result += f'<span style="font-variant: small-caps">{self._convert_btx(child)}</span>'
                else:
                    result += self._convert_btx(child)
            elif child_tag == "FT":
                # Format text (numbers, codes, etc.)
                ft_type = child.get("TYPE", "")
                if ft_type == "NUMBER" or ft_type == "DECIMAL":
                    result += (
                        f'<span class="ft-number">{self._convert_btx(child)}</span>'
                    )
                elif ft_type == "CODE":
                    result += f"<code>{self._convert_btx(child)}</code>"
                else:
                    result += f'<span class="ft-{ft_type.lower()}">{self._convert_btx(child)}</span>'
            elif child_tag == "IE":
                # Inclusion/exclusion marker
                result += '<span class="ie-marker">±</span>'
            elif child_tag == "BR":
                # Line break
                result += "<br>"
            elif child_tag == "P":
                # Paragraph
                result += f"<p>{self._convert_btx(child)}</p>"
            elif child_tag == "NOTE":
                # Note reference
                note_id = child.get("NOTE.ID", "")
                result += f'<sup class="note-ref" id="{note_id}">{self._convert_btx(child)}</sup>'
            elif child_tag == "QUOT.START":
                # Opening quotation mark
                result += "“"
            elif child_tag == "QUOT.END":
                # Closing quotation mark
                result += "”"
elif child_tag == "LIST":
|
||||
# Handle lists
|
||||
|
||||
# Formex styles to CSS list-style-type mapping
|
||||
list_style_map = {
|
||||
"ARAB": "decimal",
|
||||
"ALPHA": "upper-alpha",
|
||||
"alpha": "lower-alpha",
|
||||
"ROMAN": "upper-roman",
|
||||
"roman": "lower-roman",
|
||||
"BULLET": "disc",
|
||||
"DASH": "'—'",
|
||||
"NDASH:": "'–'",
|
||||
"NONE": "none",
|
||||
"OTHER": "none",
|
||||
}
|
||||
|
||||
list_type = child.get("TYPE", "BULLET")
|
||||
list_style_type = list_style_map[list_type]
|
||||
if list_type == "BULLET":
|
||||
result += f"<ul>{self._convert_list(child)}</ul>"
|
||||
elif list_type in ["ARAB", "ALPHA", "alpha", "ROMAN", "roman"]:
|
||||
result += f"<ol class='list-{list_style_type}'>{self._convert_list(child)}</ol>"
|
||||
else:
|
||||
result += f"<ul class='list-{list_style_type}'>{self._convert_list(child)}</ul>"
|
||||
elif child_tag == "TXT":
|
||||
# Simple text element
|
||||
result += html.escape(self._get_text(child))
|
||||
elif child_tag == "LINK":
|
||||
# Handle links (added for lxml version)
|
||||
uri = child.get("URI", "#")
|
||||
result += f'<a href="{uri}">{self._convert_btx(child)}</a>'
|
||||
elif child_tag == "REF.DOC.OJ":
|
||||
# Handle references to OJ documents
|
||||
coll = child.get("COLL", "")
|
||||
no_oj = child.get("NO.OJ", "")
|
||||
date = child.get("DATE.PUB", "")
|
||||
page = child.get("PAGE.FIRST", "")
|
||||
result += (
|
||||
f'<span class="ref-oj">{coll} {no_oj}, {date}, p. {page}</span>'
|
||||
)
|
||||
else:
|
||||
# Recursively process other element types
|
||||
result += self._convert_btx(child)
|
||||
|
||||
if child.tail:
|
||||
result += child.tail
|
||||
|
||||
return result
|
||||
|
||||
    def _convert_list(self, list_element: ET.Element) -> str:
        """Convert a Formex LIST element to HTML list items."""
        result = ""
        # Using lxml's xpath to get direct child ITEM elements
        for item in list_element.xpath(f"./{self._get_tag('ITEM')}"):
            item_content = ""
            # Process ITEM contents which should be either NP or P elements
            for child in item:
                child_tag = child.tag.replace(self.ns_prefix, "")
                if child_tag == "NP":
                    # Numbered paragraph - extract the number and text
                    no_p_elems = child.xpath(f"./{self._get_tag('NO.P')}")
                    txt_elems = child.xpath(f"./{self._get_tag('TXT')}")

                    no_p = no_p_elems[0] if no_p_elems else None
                    txt = txt_elems[0] if txt_elems else None

                    if no_p is not None and txt is not None:
                        num = self._get_text(no_p)
                        text = self._get_text(txt)
                        item_content += f'<span class="item-number">{num}</span> {text}'
                elif child_tag == "P":
                    # Regular paragraph
                    item_content += self._convert_btx(child)
                else:
                    # Other elements
                    item_content += self._convert_btx(child)

            result += f"<li>{item_content}</li>"

        return result

    def _convert_alinea(self, alinea: ET.Element) -> str:
        """Convert an ALINEA element to HTML."""
        return f'<p class="alinea">{self._convert_btx(alinea)}</p>'

    def _convert_parag(self, parag: ET.Element) -> str:
        """Convert a PARAG (paragraph) element to HTML."""
        identifier = parag.get("IDENTIFIER", "")
        parag_id = self._create_id(identifier) if identifier else ""

        # Get the paragraph number using XPath
        no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
        parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""

        # Process the alineas within the paragraph
        content = ""
        for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"):
            content += self._convert_alinea(alinea)

        # Process any comments
        for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"):
            content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Process any quotations
        for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"):
            content += (
                f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
            )

        return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'

    def _convert_subdiv(self, subdiv: ET.Element) -> str:
        """Convert a SUBDIV (subdivision) element to HTML."""
        # Get the title using XPath
        title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
        title = ""
        if title_elems:
            title_elem = title_elems[0]
            # Process TI (title) and STI (subtitle) elements
            ti_elems = title_elem.xpath(f"./{self._get_tag('TI')}")
            ti_text = self._convert_btx(ti_elems[0]) if ti_elems else ""

            sti_list = []
            for sti in title_elem.xpath(f"./{self._get_tag('STI')}"):
                sti_list.append(self._convert_btx(sti))

            title = f'<h4 class="subdivision-title">{ti_text}</h4>'
            if sti_list:
                title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'

        # Process content: either paragraphs, alineas, or nested subdivisions
        content = ""

        # Process paragraphs directly under this subdivision
        for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"):
            content += self._convert_parag(parag)

        # Process alineas directly under this subdivision
        for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"):
            content += self._convert_alinea(alinea)

        # Process comments directly under this subdivision
        for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"):
            content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Process quotations directly under this subdivision
        for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"):
            content += (
                f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
            )

        # Process nested subdivisions directly under this subdivision
        for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
            content += self._convert_subdiv(sub)

        return f'<section class="subdivision">{title}{content}</section>'

    def convert_article(self, article: Union[str, ET.Element]) -> str:
        """
        Convert a Formex <ARTICLE> element to HTML5.

        Args:
            article: Either an lxml Element or an XML string representing an ARTICLE

        Returns:
            A string containing the HTML5 representation of the article
        """
        # Parse the article if it's a string
        if isinstance(article, str):
            try:
                parser = ET.XMLParser(remove_blank_text=True)
                article = ET.fromstring(article.encode("utf-8"), parser)
            except ET.XMLSyntaxError as e:
                return f"<p>Error parsing XML: {e}</p>"

        # Extract the article identifier
        identifier = article.get("IDENTIFIER", "")
        article_id = self._create_id(identifier)

        # Strip processing instructions
        ET.strip_tags(article, lxml.etree.PI)

        # Extract the article title
        # Use lxml's xpath capabilities for better namespace handling
        ti_art = article.xpath(f".//{self._get_tag('TI.ART')}")
        ti_art = ti_art[0] if ti_art else None
        article_title = self._convert_btx(ti_art) if ti_art is not None else ""

        # Extract the article subtitle if present
        sti_art = article.xpath(f".//{self._get_tag('STI.ART')}")
        sti_art = sti_art[0] if sti_art else None
        article_subtitle = self._convert_btx(sti_art) if sti_art is not None else ""

        # Build the header section
        header = f'<header><h3 class="article-title">{article_title}</h3>'
        if article_subtitle:
            header += f'<h4 class="article-subtitle">{article_subtitle}</h4>'
        header += "</header>"

        # Process the content based on what's present
        content = ""

        # Check if we have alineas directly under the article
        alineas = article.xpath(f"./{self._get_tag('ALINEA')}")
        if alineas:
            for alinea in alineas:
                content += self._convert_alinea(alinea)

        # Check if we have paragraphs directly under the article
        parags = article.xpath(f"./{self._get_tag('PARAG')}")
        if parags:
            for parag in parags:
                content += self._convert_parag(parag)

        # Check for comments directly under the article
        comments = article.xpath(f"./{self._get_tag('COMMENT')}")
        if comments:
            for comment in comments:
                content += f'<div class="comment">{self._convert_btx(comment)}</div>'

        # Check for quotations directly under the article
        quots = article.xpath(f"./{self._get_tag('QUOT.S')}")
        if quots:
            for quot in quots:
                content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'

        # Check for subdivisions directly under the article
        subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}")
        if subdivs:
            for subdiv in subdivs:
                content += self._convert_subdiv(subdiv)

        # Assemble the complete article
        return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'
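
A minimal usage sketch of the converter above (not part of the committed file); the ARTICLE fragment is invented for illustration and far simpler than real Formex output:

# Sketch only: hand-written toy ARTICLE, not real Formex output from Cellar.
from formex_viewer.formex4 import FormexArticleConverter

sample_article = """
<ARTICLE IDENTIFIER="001">
  <TI.ART>Article 1</TI.ART>
  <ALINEA>This Regulation applies to <HT TYPE="ITALIC">products with digital elements</HT>.</ALINEA>
</ARTICLE>
"""

converter = FormexArticleConverter()
html_out = converter.convert_article(sample_article)
print(html_out)
# Expected shape:
# <article id="art-001" class="formex-article"><header><h3 class="article-title">Article 1</h3></header>
# <div class="article-content"><p class="alinea">This Regulation applies to <em>products with digital elements</em>.</p></div></article>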

src/formex_viewer/main.py (new file, 232 lines)
@@ -0,0 +1,232 @@
import io
import logging
import typing
import zipfile
from dataclasses import dataclass
from enum import StrEnum
from pathlib import Path

import httpx
from lxml import etree

from formex_viewer import formex4


class SystemName(StrEnum):
    CELEX = "celex"
    CELLAR = "cellar"
    OJ = "oj"


@dataclass
class CellarIdentifier:
    system_name: SystemName
    system_id: str


class Language(StrEnum):
    """Language enum for Cellar"""

    ENG = "eng"
    DEU = "deu"
    FRA = "fra"
    ITA = "ita"


class ContentType(StrEnum):
    XML_FMX4 = "application/xml;mtype=fmx4"
    ZIP_FMX4 = "application/zip;mtype=fmx4"
    XHTML_XML = "application/xhtml+xml"


@dataclass
class Metadata:
    publication_text: CellarIdentifier

    @classmethod
    def from_xml(cls, xmlstr: str):
        """Parse XML metadata"""

        tree = etree.fromstring(xmlstr.encode("utf-8"))
        url = tree.xpath("//NOTICE/EXPRESSION/URI")[0]

        return cls(
            publication_text=CellarIdentifier(
                system_name=SystemName(url.find("TYPE").text),
                system_id=url.find("IDENTIFIER").text,
            )
        )


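A hedged sketch of the notice shape that Metadata.from_xml expects (not part of the committed file); the element values are invented, only the NOTICE/EXPRESSION/URI path with TYPE and IDENTIFIER children matters:

# Sketch only: a minimal object notice carrying just the fields read above;
# real Cellar notices contain far more metadata.
from formex_viewer.main import Metadata

notice = """<NOTICE>
  <EXPRESSION>
    <URI>
      <TYPE>cellar</TYPE>
      <IDENTIFIER>cellar:00000000-0000-0000-0000-000000000000</IDENTIFIER>
    </URI>
  </EXPRESSION>
</NOTICE>"""

meta = Metadata.from_xml(notice)
print(meta.publication_text.system_name)  # -> cellar
print(meta.publication_text.system_id)    # -> cellar:00000000-0000-0000-0000-000000000000
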
def extract_fmx_main_publication(zip_data: typing.BinaryIO) -> str:
    # Find the main document in a Formex 4 ZIP archive
    #
    # Algorithm:
    # 1. Find the XML file containing the document descriptor (*.doc.fmx.xml)
    # 2. Parse the XML file to find the main publication (XPath `/DOC/FMX/DOC.MAIN.PUB/REF.PHYS`)
    # 3. Extract the file from the ZIP archive using the `FILE` attribute value

    with zipfile.ZipFile(zip_data) as z:
        # Find the document descriptor XML file
        doc_xml_files = [
            f for f in z.namelist() if f.endswith((".doc.fmx.xml", ".doc.xml"))
        ]
        if not doc_xml_files:
            logging.info("ZIP file contents: %s", z.namelist())
            raise ValueError(
                "No document descriptor (*.doc.xml / *.doc.fmx.xml) found in the archive"
            )

        doc_xml_file = doc_xml_files[0]

        # Parse the XML file
        with z.open(doc_xml_file) as f:
            tree = etree.parse(f)

        # Find the main publication reference
        main_pub_ref = tree.xpath('/DOC/FMX/DOC.MAIN.PUB/REF.PHYS[@TYPE="DOC.XML"]')
        if not main_pub_ref:
            raise ValueError(
                "Main publication reference not found in document descriptor"
            )

        # Get the FILE attribute
        main_file = main_pub_ref[0].get("FILE")
        if not main_file:
            raise ValueError("FILE attribute not found in main publication reference")

        # Extract the file content
        with z.open(main_file) as f:
            return f.read().decode("utf-8")


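A quick sanity-check sketch for the three steps above (not part of the committed file); the descriptor layout and file names are assumptions chosen to match the XPath used here, not a real Cellar archive:

# Sketch only: an in-memory ZIP with an invented descriptor and payload.
import io
import zipfile

from formex_viewer.main import extract_fmx_main_publication

descriptor = """<DOC>
  <FMX>
    <DOC.MAIN.PUB>
      <REF.PHYS TYPE="DOC.XML" FILE="L_2024001EN.01000101.xml"/>
    </DOC.MAIN.PUB>
  </FMX>
</DOC>"""

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as z:
    z.writestr("L_2024001EN.doc.fmx.xml", descriptor)
    z.writestr("L_2024001EN.01000101.xml", "<ACT>main publication</ACT>")

buf.seek(0)
print(extract_fmx_main_publication(buf))  # -> <ACT>main publication</ACT>
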
class CellarClient:
    def __init__(self, language: Language = Language.ENG):
        self._client = httpx.Client(
            base_url="http://publications.europa.eu", follow_redirects=True
        )
        self.language = language

    def metadata(self, cellar_id: CellarIdentifier):
        """Fetch metadata from Cellar"""

        resp = self._client.get(
            f"/resource/{cellar_id.system_name}/{cellar_id.system_id}",
            headers={
                "Accept": "application/xml;notice=object",
                "Accept-Language": self.language,
            },
        )
        resp.raise_for_status()
        return Metadata.from_xml(resp.text)

    def publication_text(
        self,
        cellar_id: CellarIdentifier,
        content_type: ContentType,
    ) -> str:
        """Fetch a publication from Cellar"""

        metadata = self.metadata(cellar_id)
        identifier = metadata.publication_text

        resp = self._client.get(
            f"/resource/{identifier.system_name}/{identifier.system_id}",
            headers={
                "Accept": content_type,
                "Accept-Language": self.language,
            },
        )
        resp.raise_for_status()

        if "zip" in resp.headers.get("Content-Type", ""):
            return extract_fmx_main_publication(io.BytesIO(resp.content))
        else:
            return resp.text


def _get_path(
    base_path: Path,
    typ: typing.Literal["recital", "article", "annex"],
    num: str,
    extension: str,
) -> Path:
    """Get the path for a given type and number"""

    plurals = {
        "recital": "recitals",
        "article": "articles",
        "annex": "annexes",
    }

    return (base_path / plurals[typ] / f"{num}").with_suffix(extension)


class HtmlConverter:
    """Convert a Formex 4 XML document to HTML files"""

    extension = ".html"

    def __init__(self, fmx4_content: str, outdir: Path):
        self._fmx4 = fmx4_content
        self._xml = etree.fromstring(self._fmx4.encode("utf-8"))
        self._outdir = outdir
        self._outdir.mkdir(parents=True, exist_ok=True)

    def convert(self) -> None:
        """Split the publication text into separate files.

        Structure:
        - recitals/: All recitals, one per file
        - articles/: All articles, one per file
        - annexes/: All annexes, one per file (not yet extracted in this initial version)
        """

        # Extract recitals
        recital_xpath = "//GR.CONSID/CONSID/NP"
        recitals = self._xml.xpath(recital_xpath)
        for recital in recitals:
            num_str = recital.find("NO.P").text
            num = num_str.strip("()")

            filename = _get_path(self._outdir, "recital", num, self.extension)
            filename.parent.mkdir(parents=True, exist_ok=True)
            txt_elem = recital.find("TXT")
            if txt_elem is None:
                logging.warning("Recital %s has no text", num)
                continue
            txt = formex4.text_content(txt_elem)
            filename.write_text(txt, encoding="utf-8")

        # Extract articles
        article_xpath = "//ARTICLE"
        articles = self._xml.xpath(article_xpath)
        for article in articles:
            num = article.get("IDENTIFIER").lstrip("0")

            filename = _get_path(self._outdir, "article", num, self.extension)
            filename.parent.mkdir(parents=True, exist_ok=True)

            txt = formex4.FormexArticleConverter().convert_article(article)
            filename.write_text(txt, encoding="utf-8")


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Cyber Resilience Act - CELEX 32024R2847
    cra_id = CellarIdentifier(system_name=SystemName.CELEX, system_id="32024R2847")

    # AI Act - CELEX 32024R1689
    aia_id = CellarIdentifier(system_name=SystemName.CELEX, system_id="32024R1689")

    content_type = ContentType.ZIP_FMX4
    language = Language.ENG

    client = CellarClient(language=language)

    tmpdir = Path("tmp")
    tmpdir.mkdir(parents=True, exist_ok=True)
    # The client already carries the language; publication_text only takes
    # the identifier and the content type.
    fmx4_text = client.publication_text(cra_id, content_type)
    converter = HtmlConverter(fmx4_text, tmpdir)
    converter.convert()
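
A sketch of what HtmlConverter.convert() produces (not part of the committed file); the ACT root and all element content are invented, only the GR.CONSID/CONSID/NP and ARTICLE shapes matter:

# Sketch only: toy input; real documents come from CellarClient as above.
from pathlib import Path

from formex_viewer.main import HtmlConverter

sample = """<ACT>
  <GR.CONSID>
    <CONSID><NP><NO.P>(1)</NO.P><TXT>First recital text.</TXT></NP></CONSID>
    <CONSID><NP><NO.P>(2)</NO.P><TXT>Second recital text.</TXT></NP></CONSID>
  </GR.CONSID>
  <ARTICLE IDENTIFIER="001">
    <TI.ART>Article 1</TI.ART>
    <ALINEA>Subject matter of the act.</ALINEA>
  </ARTICLE>
</ACT>"""

outdir = Path("tmp-demo")
HtmlConverter(sample, outdir).convert()

# Resulting tree:
#   tmp-demo/recitals/1.html   (plain recital text)
#   tmp-demo/recitals/2.html
#   tmp-demo/articles/1.html   (HTML from FormexArticleConverter)
print((outdir / "articles" / "1.html").read_text(encoding="utf-8"))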

src/formex_viewer/parser.py (new file, empty)

src/formex_viewer/server.py (new file, 82 lines)
@@ -0,0 +1,82 @@
import lxml.etree as ET
from fastapi import FastAPI, Response
from fastapi.middleware.cors import CORSMiddleware

from formex_viewer.formex4 import FormexArticleConverter
from formex_viewer.main import (
    CellarClient,
    CellarIdentifier,
    ContentType,
    Language,
    SystemName,
)

origins = [
    "http://localhost:5173",
]
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

type CacheKey = tuple[str, Language]

CACHE: dict[CacheKey, str] = {}


def _get_fmx4_data(celex_id: str, language: Language) -> str:
    """
    Fetch the FMX4 data from Cellar, caching it per (CELEX ID, language).
    """

    if (celex_id, language) in CACHE:
        return CACHE[(celex_id, language)]

    client = CellarClient(language)
    cellar_id = CellarIdentifier(
        system_name=SystemName.CELEX,
        system_id=celex_id,
    )
    fmx4_data = client.publication_text(cellar_id, ContentType.ZIP_FMX4)

    CACHE[(celex_id, language)] = fmx4_data

    return fmx4_data


@app.get("/{celex_id}/articles")
def article_ids(celex_id: str, language: Language = Language.ENG):
    """
    Return the article IDs of the requested publication.
    """
    fmx4_data = _get_fmx4_data(celex_id, language)
    xml = ET.fromstring(fmx4_data.encode("utf-8"))

    article_xpath = "//ARTICLE/@IDENTIFIER"
    article_ids = xml.xpath(article_xpath)
    article_ids = [int(article_id.lstrip("0")) for article_id in article_ids]
    article_ids.sort()
    return article_ids


@app.get("/{celex_id}/articles/{article_id}/{language}")
|
||||
def article(celex_id: str, article_id: int, language: Language = Language.ENG):
|
||||
"""
|
||||
Fetch an article from the server.
|
||||
"""
|
||||
fmx4_data = _get_fmx4_data(celex_id, language)
|
||||
xml = ET.fromstring(fmx4_data.encode("utf-8"))
|
||||
|
||||
article_xpath = "//ARTICLE"
|
||||
articles = xml.xpath(article_xpath)
|
||||
for article in articles:
|
||||
num = article.get("IDENTIFIER").lstrip("0")
|
||||
if num == str(article_id):
|
||||
return Response(
|
||||
FormexArticleConverter().convert_article(article),
|
||||
media_type="text/html",
|
||||
)
|
||||
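
A sketch of how a client might call the two endpoints (not part of the committed file), assuming the app is served locally with something like uvicorn formex_viewer.server:app --port 8000 (the port is an arbitrary choice):

# Sketch only: assumes the FastAPI app above is running on localhost:8000 and
# that the CELEX number resolves in Cellar (32024R2847 is the Cyber Resilience
# Act already used in main.py).
import httpx

BASE = "http://localhost:8000"
celex = "32024R2847"

# List the available article numbers
ids = httpx.get(f"{BASE}/{celex}/articles", timeout=120).json()
print(ids[:5])  # e.g. [1, 2, 3, 4, 5]

# Fetch one article as HTML (language is a path parameter on this route)
resp = httpx.get(f"{BASE}/{celex}/articles/1/eng", timeout=120)
print(resp.headers["content-type"])  # text/html; charset=utf-8
print(resp.text[:200])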