Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • caosdb/src/caosdb-crawler
1 result
Show changes
Commits on Source (15)
...@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ...@@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added ### ### Added ###
- XMLTextNodeConverter for converting text nodes created by XMLTagConverter
- XMLAttributeNodeConverter for converting attribute nodes created by XMLTagConverter
### Changed ### ### Changed ###
### Deprecated ### ### Deprecated ###
...@@ -21,6 +24,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ...@@ -21,6 +24,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Documentation ### ### Documentation ###
## [0.9.1] - 2024-09-26 ##
### Fixed ###
* ImpossibleMergeErrors now correctly include the problematic property
and its values in their string representation.
## [0.9.0] - 2024-09-05 ## ## [0.9.0] - 2024-09-05 ##
### Added ### ### Added ###
......
...@@ -17,6 +17,6 @@ authors: ...@@ -17,6 +17,6 @@ authors:
given-names: Alexander given-names: Alexander
orcid: https://orcid.org/0000-0003-4124-9649 orcid: https://orcid.org/0000-0003-4124-9649
title: CaosDB - Crawler title: CaosDB - Crawler
version: 0.9.0 version: 0.9.1
doi: 10.3390/data9020024 doi: 10.3390/data9020024
date-released: 2024-09-05 date-released: 2024-09-26
\ No newline at end of file \ No newline at end of file
[metadata] [metadata]
name = caoscrawler name = caoscrawler
version = 0.9.1 version = 0.9.2
author = Alexander Schlemmer author = Alexander Schlemmer
author_email = alexander.schlemmer@ds.mpg.de author_email = alexander.schlemmer@ds.mpg.de
description = A new crawler for caosdb description = A new crawler for LinkAhead
long_description = file: README.md long_description = file: README.md
long_description_content_type = text/markdown long_description_content_type = text/markdown
# url # url
......
...@@ -76,6 +76,7 @@ cfood: ...@@ -76,6 +76,7 @@ cfood:
- XMLFile - XMLFile
- XMLTag - XMLTag
- XMLTextNode - XMLTextNode
- XMLAttributeNode
- PropertiesFromDictElement - PropertiesFromDictElement
description: Type of this converter node. description: Type of this converter node.
match: match:
......
...@@ -195,7 +195,10 @@ class XMLTagConverter(Converter): ...@@ -195,7 +195,10 @@ class XMLTagConverter(Converter):
class XMLTextNodeConverter(Converter): class XMLTextNodeConverter(Converter):
def create_children(self, generalStore: GeneralStore, element: StructureElement): def create_children(self, generalStore: GeneralStore, element: StructureElement):
raise NotImplementedError() """
This converter does not create children.
"""
return []
def typecheck(self, element: StructureElement): def typecheck(self, element: StructureElement):
""" """
...@@ -210,6 +213,47 @@ class XMLTextNodeConverter(Converter): ...@@ -210,6 +213,47 @@ class XMLTextNodeConverter(Converter):
if not isinstance(element, XMLTextNode): if not isinstance(element, XMLTextNode):
raise TypeError("Element must be an instance of XMLTextNode.") raise TypeError("Element must be an instance of XMLTextNode.")
raise NotImplementedError() vardict = {}
m_text = re.match(self.definition["match_text"], element.value,
re.DOTALL)
if m_text is None:
return None
vardict.update(m_text.groupdict())
return vardict
class XMLAttributeNodeConverter(Converter):
    """Converter for attribute nodes (``XMLAttributeNode`` structure elements).

    The converter definition is read for the two keys ``match_name`` and
    ``match_value``; both are used as regular expressions in ``match``.
    """

    def create_children(self, generalStore: GeneralStore, element: StructureElement):
        """
        Return an empty list, because attribute nodes have no children.
        """
        return []

    def typecheck(self, element: StructureElement):
        """
        Tell whether ``element`` is an ``XMLAttributeNode`` and can therefore
        be handled by this converter.
        """
        return isinstance(element, XMLAttributeNode)

    def match(self, element: StructureElement) -> Optional[dict]:
        """
        Match the attribute's key against ``match_name`` and its value
        against ``match_value``.

        Returns the named groups of both regular expressions merged into one
        dict, or ``None`` as soon as one of the two expressions does not match.

        Raises a ``TypeError`` if ``element`` is not an ``XMLAttributeNode``.
        """
        # See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
        # for a suggestion for the design of the matching algorithm.
        if not isinstance(element, XMLAttributeNode):
            raise TypeError("Element must be an instance of XMLAttributeNode.")

        captured = {}
        # The name is checked first; the value pattern and the value itself
        # are only looked up once the name has matched.
        for pattern_key, attribute in (("match_name", "key"),
                                       ("match_value", "value")):
            found = re.match(self.definition[pattern_key],
                             getattr(element, attribute))
            if found is None:
                return None
            captured.update(found.groupdict())

        return captured
...@@ -64,6 +64,7 @@ from linkahead.utils.escape import escape_squoted_text ...@@ -64,6 +64,7 @@ from linkahead.utils.escape import escape_squoted_text
from .config import get_config_setting from .config import get_config_setting
from .converters import Converter, ConverterValidationError from .converters import Converter, ConverterValidationError
from .debug_tree import DebugTree from .debug_tree import DebugTree
from .exceptions import ImpossibleMergeError
from .identifiable_adapters import (CaosDBIdentifiableAdapter, from .identifiable_adapters import (CaosDBIdentifiableAdapter,
IdentifiableAdapter) IdentifiableAdapter)
from .logging import configure_server_side_logging from .logging import configure_server_side_logging
...@@ -597,6 +598,9 @@ one with the entities that need to be updated and the other with entities to be ...@@ -597,6 +598,9 @@ one with the entities that need to be updated and the other with entities to be
unique_names=True): unique_names=True):
Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated) Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
logger.debug("UPDATE") logger.debug("UPDATE")
# Here, it's probably much more reasonable to show a diff of the update:
# from linkahead.apiutils import compare_entities
# [compare_entities(c, db.Record(id=c.id).retrieve()) for c in to_be_updated]
logger.debug(to_be_updated) logger.debug(to_be_updated)
if len(to_be_updated) > 0: if len(to_be_updated) > 0:
if securityMode.value > SecurityMode.INSERT.value: if securityMode.value > SecurityMode.INSERT.value:
...@@ -1114,6 +1118,14 @@ def crawler_main(crawled_directory_path: str, ...@@ -1114,6 +1118,14 @@ def crawler_main(crawled_directory_path: str,
logger.error(err) logger.error(err)
_update_status_record(crawler.run_id, 0, 0, status="FAILED") _update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1 return 1
except ImpossibleMergeError as err:
logger.debug(traceback.format_exc())
logger.error(
"Encountered conflicting information when creating Records from the crawled "
f"data:\n\n{err}"
)
_update_status_record(crawler.run_id, 0, 0, status="FAILED")
return 1
except TransactionError as err: except TransactionError as err:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.error(err) logger.error(err)
......
...@@ -111,3 +111,7 @@ XMLTag: ...@@ -111,3 +111,7 @@ XMLTag:
XMLTextNode: XMLTextNode:
converter: XMLTextNodeConverter converter: XMLTextNodeConverter
package: caoscrawler.converters package: caoscrawler.converters
XMLAttributeNode:
converter: XMLAttributeNodeConverter
package: caoscrawler.converters
...@@ -20,6 +20,9 @@ ...@@ -20,6 +20,9 @@
# along with this program. If not, see <https://www.gnu.org/licenses/>. # along with this program. If not, see <https://www.gnu.org/licenses/>.
# #
from typing import Any
class ForbiddenTransaction(Exception): class ForbiddenTransaction(Exception):
"""Thrown if a transaction is needed that is not allowed. """Thrown if a transaction is needed that is not allowed.
For example an update of an entity if the security level is INSERT For example an update of an entity if the security level is INSERT
...@@ -30,12 +33,40 @@ class ForbiddenTransaction(Exception): ...@@ -30,12 +33,40 @@ class ForbiddenTransaction(Exception):
class ImpossibleMergeError(Exception):
    """Thrown if due to identifying information, two SyncNodes or two Properties of SyncNodes
    should be merged, but there is conflicting information that prevents this.

    Parameters
    ----------
    msg : str
        A case-specific error message describing where the merge error occurred.
    pname : str
        The name of the property the values of which caused the merge error.
    value_a, value_b : Any
        The two values that couldn't be merged.

    Attributes
    ----------
    message : str
        A case-specific error message describing where the merge error occurred.
    values : tuple[Any, Any]
        The two values that couldn't be merged.
    pname : str
        The name of the property the values of which caused the merge error.
    """

    def __init__(self, msg: str, pname: str, value_a: Any, value_b: Any):
        self.pname = pname
        self.values = (value_a, value_b)
        self.message = msg
        # Only the message goes to ``Exception``. Passing ``self`` as well
        # would store the exception inside its own ``args`` tuple.
        super().__init__(msg)

    def __str__(self):
        """Return the message followed by the conflicting property and its two values."""
        return (
            f"{self.message}\n\nThe problematic property is '{self.pname}' with "
            f"values '{self.values[0]}' and '{self.values[1]}'."
        )

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()
class InvalidIdentifiableYAML(Exception): class InvalidIdentifiableYAML(Exception):
......
...@@ -22,7 +22,6 @@ ...@@ -22,7 +22,6 @@
from __future__ import annotations from __future__ import annotations
import logging
from typing import TYPE_CHECKING, Any, Optional, Union from typing import TYPE_CHECKING, Any, Optional, Union
import linkahead as db import linkahead as db
...@@ -35,8 +34,6 @@ from .exceptions import ImpossibleMergeError ...@@ -35,8 +34,6 @@ from .exceptions import ImpossibleMergeError
if TYPE_CHECKING: if TYPE_CHECKING:
from .identifiable import Identifiable from .identifiable import Identifiable
logger = logging.getLogger(__name__)
class TempID(int): class TempID(int):
"""A special kind of int for negative temporary IDs. """A special kind of int for negative temporary IDs.
...@@ -87,13 +84,25 @@ class SyncNode(db.Entity): ...@@ -87,13 +84,25 @@ class SyncNode(db.Entity):
self.registered_identifiable = registered_identifiable self.registered_identifiable = registered_identifiable
def update(self, other: SyncNode) -> None: def update(self, other: SyncNode) -> None:
"""update this node with information of given ``other`` SyncNode. """Update this node with information of given ``other`` SyncNode.
Parents are added if they are not yet in the list. Properties
are added in any case. This may lead to duplication of
properties. We allow this duplication here and remove it when
we create a db.Entity (export_entity function) because if
property values are SyncNode objects, they might not be
comparable (no ID, no identifiable) yet.
Raises
------
ValueError:
The `other` SyncNode doesn't share identifiables with
`this` SyncNode, so they can't be merged.
ImpossibleMergeError:
The two SyncNodes are incompatible in their attributes
like "id", "role", "path", "file", "name", or
"description".
parents are added if they are not yet in the list
properties are added in any case. This may lead to duplication of properties.
We allow this duplication here and remove it when we create a db.Entity (export_entity
function) because if property values are SyncNode objects, they might not be comparable (no
ID, no identifiable) yet.
""" """
if other.identifiable is not None and self.identifiable is not None: if other.identifiable is not None and self.identifiable is not None:
...@@ -121,8 +130,9 @@ class SyncNode(db.Entity): ...@@ -121,8 +130,9 @@ class SyncNode(db.Entity):
f"Trying to update {attr} but this would lead to an " f"Trying to update {attr} but this would lead to an "
f"override of the value '{self.__getattribute__(attr)}' " f"override of the value '{self.__getattribute__(attr)}' "
f"by the value '{other.__getattribute__(attr)}'", f"by the value '{other.__getattribute__(attr)}'",
pname=attr, values=(self.__getattribute__(attr), pname=attr,
other.__getattribute__(attr)) value_a=self.__getattribute__(attr),
value_b=other.__getattribute__(attr)
) )
for p in other.parents: for p in other.parents:
if not parent_in_list(p, self.parents): if not parent_in_list(p, self.parents):
...@@ -136,6 +146,13 @@ class SyncNode(db.Entity): ...@@ -136,6 +146,13 @@ class SyncNode(db.Entity):
Properties are only added once (based on id or name). If values do not match, an Error is Properties are only added once (based on id or name). If values do not match, an Error is
raised. If values are SyncNode objects with IDs, they are considered equal if their IDs are raised. If values are SyncNode objects with IDs, they are considered equal if their IDs are
equal. equal.
Raises
------
RuntimeError:
In case of an unsupported role, so that no Entity can be created.
ImpossibleMergeError:
In case of conflicting property values in this SyncNode.
""" """
ent = None ent = None
if self.role == "Record": if self.role == "Record":
...@@ -175,16 +192,10 @@ class SyncNode(db.Entity): ...@@ -175,16 +192,10 @@ class SyncNode(db.Entity):
unequal = True unequal = True
if unequal: if unequal:
logger.error(
"The Crawler is trying to create an entity,"
" but there are conflicting property values."
f"Problematic Property: {p.name}\n"
f"First value:\n{entval}\n"
f"Second value:\n{pval}\n"
f"{self}"
)
ime = ImpossibleMergeError( ime = ImpossibleMergeError(
"Cannot merge Entities", pname=p.name, values=(entval, pval) f"The crawler is trying to create an entity \n\n{self}\n\nbut there are "
"conflicting property values.",
pname=p.name, value_a=entval, value_b=pval
) )
raise ime raise ime
return ent return ent
......
...@@ -33,10 +33,10 @@ copyright = '2024, IndiScale' ...@@ -33,10 +33,10 @@ copyright = '2024, IndiScale'
author = 'Alexander Schlemmer' author = 'Alexander Schlemmer'
# The short X.Y version # The short X.Y version
version = '0.9.1' version = '0.9.2'
# The full version, including alpha/beta/rc tags # The full version, including alpha/beta/rc tags
# release = '0.5.2-rc2' # release = '0.5.2-rc2'
release = '0.9.1-dev' release = '0.9.2-dev'
# -- General configuration --------------------------------------------------- # -- General configuration ---------------------------------------------------
......
...@@ -643,7 +643,7 @@ def test_load_converters(): ...@@ -643,7 +643,7 @@ def test_load_converters():
# converter classes can be loaded from their respective packages. # converter classes can be loaded from their respective packages.
# Please adapt, if defaults change! # Please adapt, if defaults change!
assert len(converter_registry) == 28 assert len(converter_registry) == 29
# All of them are contained in caoscrawler.converters # All of them are contained in caoscrawler.converters
# except for the xml converters: # except for the xml converters:
......
...@@ -487,9 +487,17 @@ a: ([b1, b2]) ...@@ -487,9 +487,17 @@ a: ([b1, b2])
# The Bs cannot be merged due to different references to Cs # The Bs cannot be merged due to different references to Cs
with raises(ImpossibleMergeError) as rte: with raises(ImpossibleMergeError) as rte:
crawler._split_into_inserts_and_updates(st) crawler._split_into_inserts_and_updates(st)
# The order of the Cs is random, so we only know that they are the
# last two elements, but not in which order the merge was
# attempted.
assert "The problematic property is 'C' with values " in str(rte.value)
assert f"'[{st.nodes[-2]}]'" in str(rte.value)
assert f"'[{st.nodes[-1]}]'" in str(rte.value)
# TODO # TODO
# assert not isinstance(rte.value, NotImplementedError), \ # assert not isinstance(rte.value, NotImplementedError), \
# "Exception must not be NotImplementedError, but plain RuntimeError." # "Exception must not be NotImplementedError, but plain RuntimeError."
# assert "Could not find referencing entities" in rte.value.args[0] # assert "Could not find referencing entities" in rte.value.args[0]
# assert "merge conflicts in the referencing" in rte.value.args[0] # assert "merge conflicts in the referencing" in rte.value.args[0]
......
...@@ -30,8 +30,7 @@ from test_crawler import (basic_retrieve_by_name_mock_up, ...@@ -30,8 +30,7 @@ from test_crawler import (basic_retrieve_by_name_mock_up,
mock_get_entity_by, mock_get_entity_by,
) )
from caoscrawler.exceptions import (ImpossibleMergeError, from caoscrawler.exceptions import (MissingIdentifyingProperty,
MissingIdentifyingProperty,
MissingRecordType, MissingRecordType,
) )
from caoscrawler.identifiable import Identifiable from caoscrawler.identifiable import Identifiable
......
...@@ -238,8 +238,9 @@ def test_export_node(): ...@@ -238,8 +238,9 @@ def test_export_node():
messages = {str(w.message) for w in caught} messages = {str(w.message) for w in caught}
assert ("Multiproperties are not supported by the crawler.") in messages assert ("Multiproperties are not supported by the crawler.") in messages
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError) as ime:
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
assert "The problematic property is 'a' with values '['b']' and '['a']'" in str(ime.value)
# SyncNodes with same ID are considered equal # SyncNodes with same ID are considered equal
rec_a = (db.Record(id=101) rec_a = (db.Record(id=101)
...@@ -269,18 +270,26 @@ def test_export_node(): ...@@ -269,18 +270,26 @@ def test_export_node():
.add_property(name="a", value=SyncNode(db.Record())) .add_property(name="a", value=SyncNode(db.Record()))
.add_property(name="a", value=SyncNode(db.Record()))) .add_property(name="a", value=SyncNode(db.Record())))
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError) as ime:
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
msg = (f"The problematic property is 'a' with values '[{SyncNode(db.Record())}]' "
f"and '[{SyncNode(db.Record())}]'")
assert msg in str(ime.value)
# different SyncNode Objects with differing ID are not equal # different SyncNode Objects with differing ID are not equal
rec_a = (db.Record(id=101) rec_a = (db.Record(id=101)
.add_parent("B") .add_parent("B")
.add_property(name="a", value=SyncNode(db.Record(id=1))) .add_property(name="a", value=SyncNode(db.Record(id=1)))
.add_property(name="a", value=SyncNode(db.Record(id=2)))) .add_property(name="a", value=SyncNode(db.Record(id=2))))
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError) as ime:
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
msg = (f"The problematic property is 'a' with values '[{SyncNode(db.Record(id=1))}]' "
f"and '[{SyncNode(db.Record(id=2))}]'")
assert msg in str(ime.value)
# SyncNodes with same ID are considered equal (list) # SyncNodes with same ID are considered equal (list)
rec_a = (db.Record(id=101) rec_a = (db.Record(id=101)
.add_parent("B") .add_parent("B")
...@@ -297,9 +306,14 @@ def test_export_node(): ...@@ -297,9 +306,14 @@ def test_export_node():
.add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]) .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))])
.add_property(name="a", value=[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))])) .add_property(name="a", value=[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))]))
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError) as ime:
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
msg = ("The problematic property is 'a' with values "
f"'{[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=2))]}' "
f"and '{[SyncNode(db.Record(id=2)), SyncNode(db.Record(id=1))]}'")
assert msg in str(ime.value)
# same SyncNode object is obviously equal (list) # same SyncNode object is obviously equal (list)
sn = SyncNode(db.Record(id=1)) sn = SyncNode(db.Record(id=1))
rec_a = (db.Record(id=101) rec_a = (db.Record(id=101)
...@@ -316,26 +330,37 @@ def test_export_node(): ...@@ -316,26 +330,37 @@ def test_export_node():
.add_property(name="a", value=[SyncNode(db.Record())]) .add_property(name="a", value=[SyncNode(db.Record())])
.add_property(name="a", value=[SyncNode(db.Record())])) .add_property(name="a", value=[SyncNode(db.Record())]))
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError) as ime:
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
msg = ("The problematic property is 'a' with values "
f"'{[SyncNode(db.Record())]}' and '{[SyncNode(db.Record())]}'")
assert msg in str(ime.value)
# different SyncNode Objects with differing ID are not equal (list) # different SyncNode Objects with differing ID are not equal (list)
rec_a = (db.Record(id=101) rec_a = (db.Record(id=101)
.add_parent("B") .add_parent("B")
.add_property(name="a", value=[SyncNode(db.Record(id=1))]) .add_property(name="a", value=[SyncNode(db.Record(id=1))])
.add_property(name="a", value=[SyncNode(db.Record(id=2))])) .add_property(name="a", value=[SyncNode(db.Record(id=2))]))
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError) as ime:
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
msg = ("The problematic property is 'a' with values "
f"'{[SyncNode(db.Record(id=1))]}' and '{[SyncNode(db.Record(id=2))]}'")
assert msg in str(ime.value)
# list vs no list # list vs no list
rec_a = (db.Record(id=101) rec_a = (db.Record(id=101)
.add_parent("B") .add_parent("B")
.add_property(name="a", value=SyncNode(db.Record(id=1))) .add_property(name="a", value=SyncNode(db.Record(id=1)))
.add_property(name="a", value=[SyncNode(db.Record(id=1))])) .add_property(name="a", value=[SyncNode(db.Record(id=1))]))
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError) as ime:
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
msg = ("The problematic property is 'a' with values "
f"'[{SyncNode(db.Record(id=1))}]' and '{[SyncNode(db.Record(id=1))]}'")
assert msg in str(ime.value)
# different list sizes # different list sizes
rec_a = (db.Record(id=101) rec_a = (db.Record(id=101)
...@@ -343,5 +368,10 @@ def test_export_node(): ...@@ -343,5 +368,10 @@ def test_export_node():
.add_property(name="a", value=[SyncNode(db.Record(id=1))]) .add_property(name="a", value=[SyncNode(db.Record(id=1))])
.add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))])) .add_property(name="a", value=[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]))
with pytest.raises(ImpossibleMergeError): with pytest.raises(ImpossibleMergeError) as ime:
exp = SyncNode(rec_a).export_entity() exp = SyncNode(rec_a).export_entity()
msg = ("The problematic property is 'a' with values "
f"'{[SyncNode(db.Record(id=1))]}' and "
f"'{[SyncNode(db.Record(id=1)), SyncNode(db.Record(id=1))]}'")
assert msg in str(ime.value)
...@@ -32,7 +32,9 @@ import yaml ...@@ -32,7 +32,9 @@ import yaml
from lxml.etree import fromstring from lxml.etree import fromstring
from pathlib import Path from pathlib import Path
from caoscrawler.converters import XMLTagConverter from caoscrawler.converters import (XMLTagConverter,
XMLAttributeNodeConverter,
XMLTextNodeConverter)
from caoscrawler.scanner import load_definition from caoscrawler.scanner import load_definition
from caoscrawler.stores import GeneralStore from caoscrawler.stores import GeneralStore
from caoscrawler.structure_elements import XMLTagElement from caoscrawler.structure_elements import XMLTagElement
...@@ -51,6 +53,9 @@ def converter_registry(): ...@@ -51,6 +53,9 @@ def converter_registry():
"XMLTextNode": { "XMLTextNode": {
"converter": "XMLTextNodeConverter", "converter": "XMLTextNodeConverter",
"package": "caoscrawler.converters"}, "package": "caoscrawler.converters"},
"XMLAttributeNode": {
"converter": "XMLAttributeNodeConverter",
"package": "caoscrawler.converters"},
} }
for key, value in converter_registry.items(): for key, value in converter_registry.items():
...@@ -294,3 +299,85 @@ nsmap: ...@@ -294,3 +299,85 @@ nsmap:
children = converter.create_children(GeneralStore(), tag) children = converter.create_children(GeneralStore(), tag)
assert len(children) == 1 assert len(children) == 1
assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2" assert children[0].name == "{default-namespace}node1[2]/{sub-namespace}node2"
def test_attrib_nodes(converter_registry):
    """
    Check that XMLAttributeNodeConverter matches attribute children by name
    and captures their value.
    """
    xml_text = """
    <node1 active="true" size="45">
        Bla
    </node1>
"""

    tag_definition = yaml.safe_load("""
type: XMLTag
match_tag: "node1"
xpath: .
tags_as_children: false
attribs_as_children: true
""")
    tag_converter = XMLTagConverter(tag_definition, "TestXMLTagConverter",
                                    converter_registry)

    tag = XMLTagElement(fromstring(xml_text))
    assert tag_converter.match(tag) is not None

    # One child per attribute of <node1>: "active" and "size".
    children = tag_converter.create_children(GeneralStore(), tag)
    assert len(children) == 2

    # (converter definition, index of the child that must not match,
    #  index of the child that must match, expected captured value)
    cases = (
        ("""
type: XMLAttributeNode
match_name: active
match_value: (?P<val>.*)
""", 1, 0, "true"),
        ("""
type: XMLAttributeNode
match_name: size
match_value: (?P<val>.*)
""", 0, 1, "45"),
    )

    for definition_text, miss_index, hit_index, expected in cases:
        attrib_converter = XMLAttributeNodeConverter(
            yaml.safe_load(definition_text),
            "TestXMLAttributeNodeConverter",
            converter_registry)

        assert attrib_converter.match(children[miss_index]) is None

        hit = attrib_converter.match(children[hit_index])
        assert hit is not None
        assert hit["val"] == expected
def test_text_nodes(converter_registry):
    """
    Test text node converters.

    The text of ``<node1>`` is exposed as a single child by the
    XMLTagConverter (``text_as_children: true``) and must be matched by an
    XMLTextNodeConverter, which captures the word ``Bla``.
    """
    xml_text = """
    <node1 active="true" size="45">
        Bla
    </node1>
"""

    converter = XMLTagConverter(yaml.safe_load("""
type: XMLTag
match_tag: "node1"
xpath: .
tags_as_children: false
text_as_children: true
"""), "TestXMLTagConverter", converter_registry)

    tag = XMLTagElement(fromstring(xml_text))
    m = converter.match(tag)
    assert m is not None

    children = converter.create_children(GeneralStore(), tag)
    assert len(children) == 1

    # Raw string: ``\s`` and ``\w`` belong to the regular expression and are
    # not valid escape sequences in an ordinary string literal.
    text_converter = XMLTextNodeConverter(yaml.safe_load(r"""
type: XMLTextNode
match_text: \s*(?P<val>\w*)\s*
"""), "TestXMLTextNodeConverter", converter_registry)

    m = text_converter.match(children[0])
    assert m is not None
    assert m["val"] == "Bla"