Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
ec0d5c36
Commit
ec0d5c36
authored
Jun 26, 2024
by
Alexander Schlemmer
Browse files
Options
Downloads
Patches
Plain Diff
ENH: implemented basic matching of structure elements of type XMLTagElement
parent
366f4b0a
No related branches found
No related tags found
2 merge requests
!181
Release 0.9.0
,
!174
XML Converter
Pipeline
#52486
passed with warnings
Jun 26, 2024
Stage: info
Stage: setup
Stage: cert
Stage: style
Stage: test
Changes
2
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/caoscrawler/xml_converter.py
+47
-8
47 additions, 8 deletions
src/caoscrawler/xml_converter.py
unittests/test_xml_converter.py
+4
-2
4 additions, 2 deletions
unittests/test_xml_converter.py
with
51 additions
and
10 deletions
src/caoscrawler/xml_converter.py
+
47
−
8
View file @
ec0d5c36
...
@@ -28,15 +28,11 @@ import logging
...
@@ -28,15 +28,11 @@ import logging
import
os
import
os
import
re
import
re
import
warnings
import
warnings
from
abc
import
ABCMeta
,
abstractmethod
from
inspect
import
signature
from
inspect
import
signature
from
string
import
Template
from
string
import
Template
from
typing
import
Any
,
Callable
,
Optional
,
Union
from
typing
import
Any
,
Callable
,
Optional
,
Union
import
linkahead
as
db
import
linkahead
as
db
import
pandas
as
pd
import
yaml
import
yaml_header_tools
from
jsonschema
import
ValidationError
,
validate
from
jsonschema
import
ValidationError
,
validate
from
.stores
import
GeneralStore
,
RecordStore
from
.stores
import
GeneralStore
,
RecordStore
...
@@ -89,9 +85,52 @@ class XMLTagConverter(Converter):
...
@@ -89,9 +85,52 @@ class XMLTagConverter(Converter):
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
# for a suggestion for the design of the matching algorithm.
if
not
isinstance
(
element
,
XMLTagElement
):
if
not
isinstance
(
element
,
XMLTagElement
):
raise
Argument
Error
(
"
Element must be an instance of XMLTagElement.
"
)
raise
Type
Error
(
"
Element must be an instance of XMLTagElement.
"
)
# Store the result of all individual regexp variable results:
vardict
=
{}
if
"
match_tag
"
in
self
.
definition
:
m_tag
=
re
.
match
(
self
.
definition
[
"
match_tag
"
],
element
.
tag
.
tag
)
if
m_tag
is
None
:
return
None
vardict
.
update
(
m_tag
.
groupdict
())
if
"
match_text
"
in
self
.
definition
:
m_text
=
re
.
match
(
self
.
definition
[
"
match_text
"
],
element
.
tag
.
text
)
if
m_text
is
None
:
return
None
return
None
vardict
.
update
(
m_text
.
groupdict
())
if
"
match_attrib
"
in
self
.
definition
:
for
attrib_def_key
,
attrib_def_value
in
self
.
definition
[
"
match_attrib
"
].
items
():
match_counter
=
0
matched_m_attrib
=
None
matched_m_attrib_value
=
None
for
attr_key
,
attr_value
in
element
.
tag
.
attrib
.
items
():
m_attrib
=
re
.
match
(
attrib_def_key
,
attr_key
)
if
m_attrib
is
not
None
:
match_counter
+=
1
matched_m_attrib
=
m_attrib
m_attrib_value
=
re
.
match
(
attrib_def_value
,
attr_value
)
if
m_attrib_value
is
None
:
breakpoint
()
return
None
matched_m_attrib_value
=
m_attrib_value
# TODO: How to deal with multiple matches?
# There are multiple options:
# - Allow multiple attribute-key matches: Leads to possible overwrites of variables
# - Require unique attribute-key and attribute-value matches: Very complex
# - Only allow one single attribute-key to match and run attribute-value match separately.
# Currently the latter option is implemented.
if
match_counter
==
0
:
return
None
elif
match_counter
>
1
:
raise
RuntimeError
(
"
Multiple attributes match the same match_attrib entry.
"
)
vardict
.
update
(
matched_m_attrib
.
groupdict
())
vardict
.
update
(
matched_m_attrib_value
.
groupdict
())
return
vardict
class
XMLTextNodeConverter
(
Converter
):
class
XMLTextNodeConverter
(
Converter
):
...
@@ -109,6 +148,6 @@ class XMLTextNodeConverter(Converter):
...
@@ -109,6 +148,6 @@ class XMLTextNodeConverter(Converter):
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# See https://gitlab.indiscale.com/caosdb/src/caosdb-crawler/-/issues/145
# for a suggestion for the design of the matching algorithm.
# for a suggestion for the design of the matching algorithm.
if
not
isinstance
(
element
,
XMLTextNode
):
if
not
isinstance
(
element
,
XMLTextNode
):
raise
Argument
Error
(
"
Element must be an instance of XMLTextNode.
"
)
raise
Type
Error
(
"
Element must be an instance of XMLTextNode.
"
)
return
None
return
None
This diff is collapsed.
Click to expand it.
unittests/test_xml_converter.py
+
4
−
2
View file @
ec0d5c36
...
@@ -96,7 +96,7 @@ def test_simple_xml(converter_registry):
...
@@ -96,7 +96,7 @@ def test_simple_xml(converter_registry):
type: XMLTag
type: XMLTag
match_tag: a
match_tag: a
match_attrib: # default is the empty dictionary
match_attrib: # default is the empty dictionary
"
(?P<ref>(href|url))
"
:
"
te
x
t(?P<number>[0-9])
"
# either the
"
href
"
or the
"
url
"
attribute must be set
"
(?P<ref>(href|url))
"
:
"
te
s
t(?P<number>[0-9])
"
# either the
"
href
"
or the
"
url
"
attribute must be set
alt: (.+) # this attribute must be present and contain at least one character
alt: (.+) # this attribute must be present and contain at least one character
match_text: .* # allow any text, also empty (this is the default)
match_text: .* # allow any text, also empty (this is the default)
...
@@ -119,3 +119,5 @@ subtree:
...
@@ -119,3 +119,5 @@ subtree:
m
=
converter
.
match
(
tag
)
m
=
converter
.
match
(
tag
)
assert
m
is
not
None
assert
m
is
not
None
assert
m
[
"
ref
"
]
==
"
href
"
assert
m
[
"
number
"
]
==
"
1
"
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment