Compare commits
5 Commits
1d467c827a
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
58bd1160c1 | ||
|
|
debaf567ea | ||
|
|
56d271d0df | ||
|
|
f0d4214d17 | ||
|
|
56b5e3e3a4 |
@@ -12,7 +12,13 @@ describe("CelexSelector", () => {
|
|||||||
expect(getByLabelText("Select example:")).toBeInTheDocument();
|
expect(getByLabelText("Select example:")).toBeInTheDocument();
|
||||||
expect(getByRole("combobox")).toBeInTheDocument();
|
expect(getByRole("combobox")).toBeInTheDocument();
|
||||||
|
|
||||||
const options = getAllByRole("option");
|
const [def, ...options] = getAllByRole("option");
|
||||||
|
|
||||||
|
// First option is the disabled placeholder option
|
||||||
|
expect(def).toHaveValue("");
|
||||||
|
expect(def).toHaveTextContent("Select an example");
|
||||||
|
expect(def).toBeDisabled();
|
||||||
|
|
||||||
expect(options).toHaveLength(examples.length);
|
expect(options).toHaveLength(examples.length);
|
||||||
for (const i in examples) {
|
for (const i in examples) {
|
||||||
expect(options[i]).toHaveValue(examples[i].id);
|
expect(options[i]).toHaveValue(examples[i].id);
|
||||||
|
|||||||
@@ -1,16 +1,15 @@
|
|||||||
.toc {
|
.toc {
|
||||||
font-size: 0.8rem;
|
font-size: 0.8rem;
|
||||||
min-width: 25vw;
|
flex: 1 0 25vw;
|
||||||
flex: 1 auto;
|
|
||||||
|
|
||||||
&.hidden {
|
&.hidden {
|
||||||
flex: 0 0;
|
display: none;
|
||||||
min-width: 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
transition: flex-basis 0.1s ease-in-out;
|
||||||
|
|
||||||
overflow-y: scroll;
|
overflow-y: scroll;
|
||||||
overflow-x: wrap;
|
overflow-x: wrap;
|
||||||
height: 100vh;
|
|
||||||
|
|
||||||
.tocDivision {
|
.tocDivision {
|
||||||
margin-block: 0.5rem;
|
margin-block: 0.5rem;
|
||||||
|
|||||||
@@ -55,15 +55,17 @@ function TOC({ toc }: TOCProps) {
|
|||||||
const [isVisible, setIsVisible] = useState(true);
|
const [isVisible, setIsVisible] = useState(true);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<nav className={[styles.toc, isVisible ? "" : styles.hidden].join(" ")}>
|
<>
|
||||||
<button
|
<button
|
||||||
onClick={() => setIsVisible(!isVisible)}
|
onClick={() => setIsVisible(!isVisible)}
|
||||||
className={styles.toggleButton}
|
className={styles.toggleButton}
|
||||||
>
|
>
|
||||||
{isVisible ? "<" : ">"}
|
{isVisible ? "<" : ">"}
|
||||||
</button>
|
</button>
|
||||||
|
<nav className={[styles.toc, isVisible ? "" : styles.hidden].join(" ")}>
|
||||||
{toc.map((division) => renderDivision(division))}
|
{toc.map((division) => renderDivision(division))}
|
||||||
</nav>
|
</nav>
|
||||||
|
</>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
export default TOC;
|
export default TOC;
|
||||||
|
|||||||
@@ -1,5 +1,9 @@
|
|||||||
export const examples = [
|
export const examples = [
|
||||||
{ name: "GDPR", id: "32016R0679" },
|
{ name: "GDPR", id: "32016R0679" },
|
||||||
{ name: "AI Act", id: "32024R1689" },
|
{ name: "AI Act", id: "32024R1689" },
|
||||||
|
{ name: "Cybersecurity Act", id: "32019R0881" },
|
||||||
{ name: "Cyber Resilience Act", id: "32024R2847" },
|
{ name: "Cyber Resilience Act", id: "32024R2847" },
|
||||||
|
{ name: "Medical Device Regulation", id: "32017R0745" },
|
||||||
|
{ name: "NIS 2 Directive", id: "32022L2555" },
|
||||||
|
{ name: "Digital Services Act", id: "32022R2065" },
|
||||||
];
|
];
|
||||||
|
|||||||
@@ -20,3 +20,8 @@ formex-viewer = "formex_viewer:main"
|
|||||||
[build-system]
|
[build-system]
|
||||||
requires = ["hatchling"]
|
requires = ["hatchling"]
|
||||||
build-backend = "hatchling.build"
|
build-backend = "hatchling.build"
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.3.5",
|
||||||
|
]
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ import html
|
|||||||
import re
|
import re
|
||||||
import warnings
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Literal, Optional, Union
|
from typing import Literal, Optional, Union, cast
|
||||||
|
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
from lxml import etree as ET
|
from lxml import etree as ET
|
||||||
@@ -10,7 +10,7 @@ from lxml import etree as ET
|
|||||||
from formex_viewer.main import Language
|
from formex_viewer.main import Language
|
||||||
|
|
||||||
|
|
||||||
def text_content(el: lxml.etree.Element) -> str:
|
def text_content(el: ET._Element) -> str:
|
||||||
"""Get the text content of an XML element, including all child elements."""
|
"""Get the text content of an XML element, including all child elements."""
|
||||||
|
|
||||||
def _iterate(el):
|
def _iterate(el):
|
||||||
@@ -35,7 +35,7 @@ class CrossReference:
|
|||||||
paragraph: int | None = None
|
paragraph: int | None = None
|
||||||
|
|
||||||
|
|
||||||
def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossReference]:
|
def extract_xrefs(el: ET._Element, language: Language) -> list[CrossReference]:
|
||||||
"""Extract cross-references from an XML element.
|
"""Extract cross-references from an XML element.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -80,19 +80,23 @@ def extract_xrefs(el: lxml.etree.Element, language: Language) -> list[CrossRefer
|
|||||||
match.group("art_num") if key == "article" else match.group("annex_num")
|
match.group("art_num") if key == "article" else match.group("annex_num")
|
||||||
)
|
)
|
||||||
parag_num = match.groupdict().get("parag_num")
|
parag_num = match.groupdict().get("parag_num")
|
||||||
|
|
||||||
|
if key not in ["article", "annex"]:
|
||||||
|
raise RuntimeError()
|
||||||
|
|
||||||
crossref_text = match.group(0)
|
crossref_text = match.group(0)
|
||||||
crossrefs.append(
|
crossrefs.append(
|
||||||
CrossReference(
|
CrossReference(
|
||||||
target=key,
|
target=key,
|
||||||
id=crossref_id,
|
id=crossref_id,
|
||||||
paragraph=parag_num,
|
paragraph=int(parag_num) if parag_num else None,
|
||||||
text=crossref_text,
|
text=crossref_text,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
return crossrefs
|
return crossrefs
|
||||||
|
|
||||||
|
|
||||||
def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | None:
|
def extract_article(doc: ET._Element, article_id: int) -> ET._Element | None:
|
||||||
"""Extract a specific article from a Formex document.
|
"""Extract a specific article from a Formex document.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -109,8 +113,8 @@ def extract_article(doc: ET.ElementBase, article_id: int) -> ET.ElementBase | No
|
|||||||
|
|
||||||
|
|
||||||
def extract_paragraph(
|
def extract_paragraph(
|
||||||
doc: ET.ElementBase, article_id: int, paragraph_id: int
|
doc: ET._Element, article_id: int, paragraph_id: int
|
||||||
) -> ET.ElementBase | None:
|
) -> ET._Element | None:
|
||||||
"""Extract a specific paragraph from an article in a Formex document.
|
"""Extract a specific paragraph from an article in a Formex document.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -146,7 +150,7 @@ class FormexArticleConverter:
|
|||||||
"""Get the tag name with namespace if available."""
|
"""Get the tag name with namespace if available."""
|
||||||
return f"{self.ns_prefix}{tag}"
|
return f"{self.ns_prefix}{tag}"
|
||||||
|
|
||||||
def _get_text(self, element: ET.Element) -> str:
|
def _get_text(self, element: ET._Element) -> str:
|
||||||
"""Get the text content of an element, including all nested text.
|
"""Get the text content of an element, including all nested text.
|
||||||
|
|
||||||
This uses lxml's text_content() method when available, falling back to
|
This uses lxml's text_content() method when available, falling back to
|
||||||
@@ -161,7 +165,7 @@ class FormexArticleConverter:
|
|||||||
except AttributeError:
|
except AttributeError:
|
||||||
# Fall back to manual traversal if text_content() is not available
|
# Fall back to manual traversal if text_content() is not available
|
||||||
text = element.text or ""
|
text = element.text or ""
|
||||||
for child in element:
|
for child in element.iterchildren(tag="*"):
|
||||||
text += self._get_text(child)
|
text += self._get_text(child)
|
||||||
if child.tail:
|
if child.tail:
|
||||||
text += child.tail
|
text += child.tail
|
||||||
@@ -182,7 +186,7 @@ class FormexArticleConverter:
|
|||||||
)
|
)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _convert_btx(self, element: ET.Element) -> str:
|
def _convert_btx(self, element: ET._Element) -> str:
|
||||||
"""
|
"""
|
||||||
Convert basic text elements (t_btx, t_btx.seq) to HTML.
|
Convert basic text elements (t_btx, t_btx.seq) to HTML.
|
||||||
|
|
||||||
@@ -202,7 +206,7 @@ class FormexArticleConverter:
|
|||||||
# Replace the cross-reference text with a link
|
# Replace the cross-reference text with a link
|
||||||
result = self._replace_xref(result, xref)
|
result = self._replace_xref(result, xref)
|
||||||
|
|
||||||
for child in element:
|
for child in element.iterchildren(tag="*"):
|
||||||
child_tag = child.tag.replace(self.ns_prefix, "")
|
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||||
|
|
||||||
# Process common inline elements
|
# Process common inline elements
|
||||||
@@ -309,7 +313,7 @@ class FormexArticleConverter:
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _convert_list(self, list_element: ET.Element) -> str:
|
def _convert_list(self, list_element: ET._Element) -> str:
|
||||||
"""Convert a Formex LIST element to HTML list items."""
|
"""Convert a Formex LIST element to HTML list items."""
|
||||||
result = ""
|
result = ""
|
||||||
# Using lxml's xpath to get direct child ITEM elements
|
# Using lxml's xpath to get direct child ITEM elements
|
||||||
@@ -347,41 +351,40 @@ class FormexArticleConverter:
|
|||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def _convert_alinea(self, alinea: ET.Element) -> str:
|
def _convert_alinea(self, alinea: ET._Element) -> str:
|
||||||
"""Convert an ALINEA element to HTML."""
|
"""Convert an ALINEA element to HTML."""
|
||||||
return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
|
return f'<p class="alinea">{self._convert_btx(alinea)}</p>'
|
||||||
|
|
||||||
def _convert_parag(self, parag: ET.Element) -> str:
|
def _convert_parag(self, parag: ET._Element) -> str:
|
||||||
"""Convert a PARAG (paragraph) element to HTML."""
|
"""Convert a PARAG (paragraph) element to HTML."""
|
||||||
identifier = parag.get("IDENTIFIER", "")
|
identifier = parag.get("IDENTIFIER", "")
|
||||||
parag_id = self._create_id(identifier) if identifier else ""
|
parag_id = self._create_id(identifier) if identifier else ""
|
||||||
|
|
||||||
# Get the paragraph number using XPath
|
|
||||||
no_parag_elems = parag.xpath(f"./{self._get_tag('NO.PARAG')}")
|
|
||||||
parag_num = self._get_text(no_parag_elems[0]) if no_parag_elems else ""
|
|
||||||
|
|
||||||
# Process the alineas within the paragraph
|
|
||||||
content = ""
|
content = ""
|
||||||
for alinea in parag.xpath(f"./{self._get_tag('ALINEA')}"):
|
for child in parag.iterchildren(tag="*"):
|
||||||
content += self._convert_alinea(alinea)
|
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||||
|
if child_tag == "ALINEA":
|
||||||
# Process any comments
|
content += self._convert_alinea(child)
|
||||||
for comment in parag.xpath(f"./{self._get_tag('COMMENT')}"):
|
elif child_tag == "COMMENT":
|
||||||
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
|
content += f'<div class="comment">{self._convert_btx(child)}</div>'
|
||||||
|
elif child_tag == "QUOT.S":
|
||||||
# Process any quotations
|
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
|
||||||
for quot in parag.xpath(f"./{self._get_tag('QUOT.S')}"):
|
elif child_tag == "NO.PARAG":
|
||||||
content += (
|
content += (
|
||||||
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
|
f'<span class="paragraph-number">{self._convert_btx(child)}</span>'
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Unexpected child element '{child_tag}' in PARAG: {text_content(child)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return f'<div class="paragraph" data-paragraph-id="{parag_id}"><span class="paragraph-number">{parag_num}</span>{content}</div>'
|
return f'<div class="paragraph" data-paragraph-id="{parag_id}">{content}</div>'
|
||||||
|
|
||||||
def _convert_subdiv(self, subdiv: ET.Element) -> str:
|
def _convert_subdiv(self, subdiv: ET._Element) -> str:
|
||||||
"""Convert a SUBDIV (subdivision) element to HTML."""
|
"""Convert a SUBDIV (subdivision) element to HTML, preserving child order."""
|
||||||
# Get the title using XPath
|
# Get the title using XPath (should be the first TITLE child if present)
|
||||||
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
|
|
||||||
title = ""
|
title = ""
|
||||||
|
title_elems = subdiv.xpath(f"./{self._get_tag('TITLE')}")
|
||||||
if title_elems:
|
if title_elems:
|
||||||
title_elem = title_elems[0]
|
title_elem = title_elems[0]
|
||||||
# Process TI (title) and STI (subtitle) elements
|
# Process TI (title) and STI (subtitle) elements
|
||||||
@@ -396,34 +399,30 @@ class FormexArticleConverter:
|
|||||||
if sti_list:
|
if sti_list:
|
||||||
title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'
|
title += f'<h5 class="subdivision-subtitle">{" ".join(sti_list)}</h5>'
|
||||||
|
|
||||||
# Process content: either paragraphs, alineas, or nested subdivisions
|
# Process all children in order, skipping TITLE (already handled)
|
||||||
content = ""
|
content = ""
|
||||||
|
for child in subdiv.iterchildren(tag="*"):
|
||||||
# Process paragraphs directly under this subdivision
|
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||||
for parag in subdiv.xpath(f"./{self._get_tag('PARAG')}"):
|
if child_tag == "TITLE":
|
||||||
content += self._convert_parag(parag)
|
continue # already handled
|
||||||
|
elif child_tag == "PARAG":
|
||||||
# Process alineas directly under this subdivision
|
content += self._convert_parag(child)
|
||||||
for alinea in subdiv.xpath(f"./{self._get_tag('ALINEA')}"):
|
elif child_tag == "ALINEA":
|
||||||
content += self._convert_alinea(alinea)
|
content += self._convert_alinea(child)
|
||||||
|
elif child_tag == "COMMENT":
|
||||||
# Process comments directly under this subdivision
|
content += f'<div class="comment">{self._convert_btx(child)}</div>'
|
||||||
for comment in subdiv.xpath(f"./{self._get_tag('COMMENT')}"):
|
elif child_tag == "QUOT.S":
|
||||||
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
|
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
|
||||||
|
elif child_tag == "SUBDIV":
|
||||||
# Process quotations directly under this subdivision
|
content += self._convert_subdiv(child)
|
||||||
for quot in subdiv.xpath(f"./{self._get_tag('QUOT.S')}"):
|
else:
|
||||||
content += (
|
raise RuntimeError(
|
||||||
f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
|
f"Unexpected child element '{child_tag}' in SUBDIV: {text_content(child)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process nested subdivisions directly under this subdivision
|
|
||||||
for sub in subdiv.xpath(f"./{self._get_tag('SUBDIV')}"):
|
|
||||||
content += self._convert_subdiv(sub)
|
|
||||||
|
|
||||||
return f'<section class="subdivision">{title}{content}</section>'
|
return f'<section class="subdivision">{title}{content}</section>'
|
||||||
|
|
||||||
def convert_article(self, article: Union[str, ET.Element]) -> str:
|
def convert_article(self, article: Union[str, ET._Element]) -> str:
|
||||||
"""
|
"""
|
||||||
Convert a Formex <ARTICLE> element to HTML5.
|
Convert a Formex <ARTICLE> element to HTML5.
|
||||||
|
|
||||||
@@ -437,7 +436,9 @@ class FormexArticleConverter:
|
|||||||
if isinstance(article, str):
|
if isinstance(article, str):
|
||||||
try:
|
try:
|
||||||
parser = ET.XMLParser(remove_blank_text=True)
|
parser = ET.XMLParser(remove_blank_text=True)
|
||||||
article = ET.fromstring(article.encode("utf-8"), parser)
|
article = cast(
|
||||||
|
ET._Element, ET.fromstring(article.encode("utf-8"), parser)
|
||||||
|
)
|
||||||
except ET.XMLSyntaxError as e:
|
except ET.XMLSyntaxError as e:
|
||||||
return f"<p>Error parsing XML: {e}</p>"
|
return f"<p>Error parsing XML: {e}</p>"
|
||||||
|
|
||||||
@@ -471,35 +472,25 @@ class FormexArticleConverter:
|
|||||||
# Process the content based on what's present
|
# Process the content based on what's present
|
||||||
content = ""
|
content = ""
|
||||||
|
|
||||||
# Check if we have alineas directly under the article
|
# Process all child elements (except TITLE) in tree order
|
||||||
alineas = article.xpath(f"./{self._get_tag('ALINEA')}")
|
for child in article.iterchildren(tag="*"):
|
||||||
if alineas:
|
child_tag = child.tag.replace(self.ns_prefix, "")
|
||||||
for alinea in alineas:
|
if child_tag in ["TI.ART", "STI.ART"]:
|
||||||
content += self._convert_alinea(alinea)
|
continue # already handled
|
||||||
|
elif child_tag == "ALINEA":
|
||||||
# Check if we have paragraphs directly under the article
|
content += self._convert_alinea(child)
|
||||||
parags = article.xpath(f"./{self._get_tag('PARAG')}")
|
elif child_tag == "PARAG":
|
||||||
if parags:
|
content += self._convert_parag(child)
|
||||||
for parag in parags:
|
elif child_tag == "COMMENT":
|
||||||
content += self._convert_parag(parag)
|
content += f'<div class="comment">{self._convert_btx(child)}</div>'
|
||||||
|
elif child_tag == "QUOT.S":
|
||||||
# Check for comments directly under the article
|
content += f'<blockquote class="quotation">{self._convert_btx(child)}</blockquote>'
|
||||||
comments = article.xpath(f"./{self._get_tag('COMMENT')}")
|
elif child_tag == "SUBDIV":
|
||||||
if comments:
|
content += self._convert_subdiv(child)
|
||||||
for comment in comments:
|
else:
|
||||||
content += f'<div class="comment">{self._convert_btx(comment)}</div>'
|
raise RuntimeError(
|
||||||
|
f"Unexpected child element '{child_tag}' in ARTICLE: {text_content(child)}"
|
||||||
# Check for quotations directly under the article
|
)
|
||||||
quots = article.xpath(f"./{self._get_tag('QUOT.S')}")
|
|
||||||
if quots:
|
|
||||||
for quot in quots:
|
|
||||||
content += f'<blockquote class="quotation">{self._convert_btx(quot)}</blockquote>'
|
|
||||||
|
|
||||||
# Check for subdivisions directly under the article
|
|
||||||
subdivs = article.xpath(f"./{self._get_tag('SUBDIV')}")
|
|
||||||
if subdivs:
|
|
||||||
for subdiv in subdivs:
|
|
||||||
content += self._convert_subdiv(subdiv)
|
|
||||||
|
|
||||||
# Assemble the complete article
|
# Assemble the complete article
|
||||||
return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'
|
return f'<article id="{article_id}" class="formex-article">{header}<div class="article-content">{content}</div></article>'
|
||||||
|
|||||||
@@ -160,9 +160,14 @@ def paragraph(
|
|||||||
"""
|
"""
|
||||||
xml = _get_fmx4_data(celex_id, language)
|
xml = _get_fmx4_data(celex_id, language)
|
||||||
parag = extract_paragraph(xml, article_id=article_id, paragraph_id=parag_id)
|
parag = extract_paragraph(xml, article_id=article_id, paragraph_id=parag_id)
|
||||||
|
if parag is None:
|
||||||
|
return Response(
|
||||||
|
"Paragraph not found",
|
||||||
|
status_code=404,
|
||||||
|
)
|
||||||
|
|
||||||
return Response(
|
return Response(
|
||||||
FormexArticleConverter(language=language).convert_article(parag),
|
FormexArticleConverter(language=language)._convert_parag(parag),
|
||||||
media_type="text/html",
|
media_type="text/html",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
52
tests/test_parser.py
Normal file
52
tests/test_parser.py
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
import pytest
|
||||||
|
from lxml import etree as ET
|
||||||
|
|
||||||
|
from formex_viewer.formex4 import FormexArticleConverter
|
||||||
|
from formex_viewer.main import Language
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def converter():
|
||||||
|
return FormexArticleConverter(language=Language.ENG)
|
||||||
|
|
||||||
|
|
||||||
|
def test_convert_tree_order(converter):
|
||||||
|
"""Test that the order of HTML blocks in the converted article matches the order of elements in the XML tree."""
|
||||||
|
|
||||||
|
xml = """
|
||||||
|
<ARTICLE>
|
||||||
|
<SUBDIV>
|
||||||
|
<TITLE>
|
||||||
|
<TI>Subdivision Title</TI>
|
||||||
|
<STI>Subdivision Subtitle</STI>
|
||||||
|
</TITLE>
|
||||||
|
<PARAG IDENTIFIER="001.001">
|
||||||
|
<NO.PARAG>1</NO.PARAG>
|
||||||
|
<ALINEA>Paragraph 1 text.</ALINEA>
|
||||||
|
</PARAG>
|
||||||
|
<COMMENT>Comment text.</COMMENT>
|
||||||
|
<ALINEA>Alinea text.</ALINEA>
|
||||||
|
<QUOT.S>Quotation text.</QUOT.S>
|
||||||
|
<SUBDIV>
|
||||||
|
<TITLE>
|
||||||
|
<TI>Nested Subdivision</TI>
|
||||||
|
</TITLE>
|
||||||
|
<ALINEA>Nested alinea.</ALINEA>
|
||||||
|
</SUBDIV>
|
||||||
|
</SUBDIV>
|
||||||
|
</ARTICLE>
|
||||||
|
"""
|
||||||
|
parser = ET.XMLParser(remove_blank_text=True)
|
||||||
|
el = ET.fromstring(xml, parser)
|
||||||
|
html = converter.convert_article(el)
|
||||||
|
|
||||||
|
# Check that the order of HTML blocks matches the order of elements in the XML tree
|
||||||
|
idx_title = html.index("Subdivision Title")
|
||||||
|
idx_parag = html.index('class="paragraph"')
|
||||||
|
idx_comment = html.index("Comment text.")
|
||||||
|
idx_alinea = html.index("Alinea text.")
|
||||||
|
idx_quot = html.index("Quotation text.")
|
||||||
|
idx_nested = html.index("Nested Subdivision")
|
||||||
|
|
||||||
|
# The order in the XML: title, parag, comment, alinea, quot, nested subdiv
|
||||||
|
assert idx_title < idx_parag < idx_comment < idx_alinea < idx_quot < idx_nested
|
||||||
Reference in New Issue
Block a user