fix: Preserve XML tree order in Formex parser
This commit is contained in:
52
tests/test_parser.py
Normal file
52
tests/test_parser.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import pytest
|
||||
from lxml import etree as ET
|
||||
|
||||
from formex_viewer.formex4 import FormexArticleConverter
|
||||
from formex_viewer.main import Language
|
||||
|
||||
|
||||
@pytest.fixture
def converter():
    """Provide a ``FormexArticleConverter`` configured with ``Language.ENG``."""
    english_converter = FormexArticleConverter(language=Language.ENG)
    return english_converter
|
||||
|
||||
|
||||
def test_convert_tree_order(converter):
    """Test that converted HTML blocks follow the order of the XML tree.

    A ``SUBDIV`` holding a title, a numbered paragraph, a comment, a bare
    alinea, a quotation and a nested subdivision is converted, and the first
    occurrence of a marker for each child must appear in the output in the
    same sequence as the children appear in the source XML.
    """
    xml = """
    <ARTICLE>
      <SUBDIV>
        <TITLE>
          <TI>Subdivision Title</TI>
          <STI>Subdivision Subtitle</STI>
        </TITLE>
        <PARAG IDENTIFIER="001.001">
          <NO.PARAG>1</NO.PARAG>
          <ALINEA>Paragraph 1 text.</ALINEA>
        </PARAG>
        <COMMENT>Comment text.</COMMENT>
        <ALINEA>Alinea text.</ALINEA>
        <QUOT.S>Quotation text.</QUOT.S>
        <SUBDIV>
          <TITLE>
            <TI>Nested Subdivision</TI>
          </TITLE>
          <ALINEA>Nested alinea.</ALINEA>
        </SUBDIV>
      </SUBDIV>
    </ARTICLE>
    """
    # remove_blank_text drops the whitespace-only text between elements, so
    # the indentation of the literal above does not reach the converter.
    parser = ET.XMLParser(remove_blank_text=True)
    el = ET.fromstring(xml, parser)
    html = converter.convert_article(el)

    # Markers listed in XML document order:
    # title, parag, comment, alinea, quot, nested subdiv.
    expected_order = [
        ("title", "Subdivision Title"),
        ("parag", 'class="paragraph"'),
        ("comment", "Comment text."),
        ("alinea", "Alinea text."),
        ("quot", "Quotation text."),
        ("nested subdiv", "Nested Subdivision"),
    ]

    positions = []
    for label, marker in expected_order:
        # Fail with a readable message instead of a bare ValueError from index().
        assert marker in html, f"{label}: marker {marker!r} not found in converted HTML"
        positions.append((label, html.index(marker)))

    # Each block must start strictly before the one that follows it in the XML.
    for (prev_label, prev_pos), (next_label, next_pos) in zip(positions, positions[1:]):
        assert prev_pos < next_pos, (
            f"{prev_label!r} (offset {prev_pos}) should precede "
            f"{next_label!r} (offset {next_pos}) in the converted HTML"
        )
|
||||
Reference in New Issue
Block a user