Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
cd2ff85c
Commit
cd2ff85c
authored
3 years ago
by
Alexander Schlemmer
Browse files
Options
Downloads
Patches
Plain Diff
new version with variable information backends
parent
0e743800
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/newcrawler/crawl.py
+179
-143
179 additions, 143 deletions
src/newcrawler/crawl.py
tests/scifolder_cfood.yml
+1
-0
1 addition, 0 deletions
tests/scifolder_cfood.yml
with
180 additions
and
143 deletions
src/newcrawler/crawl.py
+
179
−
143
View file @
cd2ff85c
...
...
@@ -8,89 +8,12 @@ import sys
import
yaml
import
re
import
json
import
yaml_header_tools
from
abc
import
abstractmethod
def match_file_object(node: dict, filename: str):
    """
    Try to match a filename with the supplied node.

    This function only uses the current path name specified by filename.
    It does not check whether the file system object behind that path is
    valid and matches the type of the node.

    Parameters
    ----------
    node : A dictionary containing the matcher. node["re"] is the regular
           expression; node["case"] equal to "insensitive" makes the match
           ignore case.
    filename : A filename to match.

    Returns
    -------
    None if the matcher does not match and otherwise a dict with the values
    of the matcher: key 0 holds the complete match, keys 1..n the numbered
    groups and the remaining keys the named groups.
    """
    flags = re.IGNORECASE if node["case"] == "insensitive" else 0

    # The flags must be handed to the compiler; computing them without
    # passing them on silently ignored the "insensitive" setting.
    pattern = re.compile(node["re"], flags)
    matcher = pattern.match(filename)
    if matcher is None:
        return None

    # Value of node:
    # - Add the numeric groups
    # - Add the dictionary groups as well
    valdict = {0: matcher.group()}
    valdict.update(enumerate(matcher.groups(), start=1))
    valdict.update(matcher.groupdict())
    return valdict
def get_subnode_with_defaults(node: dict, key: str):
    """
    Return the key from node as subnode setting some important defaults for
    the cfood specification.

    The subnode is modified in place; entries that are already present are
    left untouched.

    Currently this is:
    - Creating an "re" (regular expression) from the escaped key, if no re
      is set.
    - Add type "dir" if no type is present.
    - Add default case "sensitive" to the node.
    - Add "nodeName" set to the key if no nodeName is present.

    Parameters
    ----------
    node : The dictionary containing the subnode as key.
    key : The key of the dictionary.

    Returns
    -------
    The subnode including the defaults.

    Raises
    ------
    ValueError if key is not contained in node.
    """
    if key not in node:
        raise ValueError("Key {} is not in node.".format(key))

    subnode = node[key]
    subnode.setdefault("re", re.escape(key))
    subnode.setdefault("type", "dir")
    subnode.setdefault("case", "sensitive")
    subnode.setdefault("nodeName", key)
    return subnode
def
match_complete
(
node
:
dict
):
"""
Determine whether the match is complete.
...
...
@@ -121,13 +44,84 @@ class InformationBackend(object):
return
@abstractmethod
def
sub_matcher
(
self
,
current_node
,
current_element
):
def
sub_matcher
(
self
,
current_node
,
current_element
,
subelement
):
pass
class
DirectoryInformationBackend
(
InformationBackend
):
def
__init__
(
current_dir
):
def
__init__
(
self
,
current_dir
):
self
.
current_dir
=
current_dir
def add_defaults(self, node: dict):
    """
    Set some important defaults of the cfood specification on node.

    The node is modified in place; entries that are already present are
    left untouched.

    Currently this is:
    - Creating an "re" (regular expression) from the escaped "nodeName" of
      the node, if no re is set.
    - Add type "dir" if no type is present.
    - Add default case "sensitive" to the node.

    Parameters
    ----------
    node : The dictionary describing the node. It has to contain "nodeName"
           whenever "re" is missing.

    Returns
    -------
    None
    """
    if "re" not in node:
        # Deliberately not setdefault(): node["nodeName"] must only be
        # looked up when it is actually needed, because a node that
        # already carries "re" is allowed to lack "nodeName".
        node["re"] = re.escape(node["nodeName"])
    node.setdefault("type", "dir")
    node.setdefault("case", "sensitive")
def match_file_object(self, current_node: dict, current_element: str):
    """
    Try to match a filename with the supplied current_node.

    This function only uses the current path name specified by
    current_element. It does not check whether the file system object
    behind that path is valid and matches the type of the current_node.

    Parameters
    ----------
    current_node : A dictionary containing the matcher. current_node["re"]
                   is the regular expression; current_node["case"] equal to
                   "insensitive" makes the match ignore case.
    current_element : A filename to match.

    Returns
    -------
    None if the matcher does not match and otherwise a dict with the values
    of the matcher: key 0 holds the complete match, keys 1..n the numbered
    groups and the remaining keys the named groups.
    """
    flags = re.IGNORECASE if current_node["case"] == "insensitive" else 0

    # The flags must be handed to the compiler; computing them without
    # passing them on silently ignored the "insensitive" setting.
    pattern = re.compile(current_node["re"], flags)
    matcher = pattern.match(current_element)
    if matcher is None:
        return None

    # Value of current_node:
    # - Add the numeric groups
    # - Add the dictionary groups as well
    valdict = {0: matcher.group()}
    valdict.update(enumerate(matcher.groups(), start=1))
    valdict.update(matcher.groupdict())
    return valdict
def
check_type
(
self
,
current_node
,
current_element
):
path
=
os
.
path
.
join
(
self
.
current_dir
,
current_element
)
...
...
@@ -141,74 +135,82 @@ class DirectoryInformationBackend(InformationBackend):
def list_elements_function(self):
    """
    Return the names of the entries of self.current_dir.

    os.listdir returns the names in arbitrary order; they are sorted here
    so that the order in which elements are matched is reproducible.

    Returns
    -------
    A sorted list of entry names (not full paths).
    """
    return sorted(os.listdir(self.current_dir))
def
sub_matcher
(
self
,
current_node
,
subelement
):
def
sub_matcher
(
self
,
current_node
,
current_element
,
subelement
):
path
=
os
.
path
.
join
(
self
.
current_dir
,
current_element
)
if
current_node
[
"
type
"
]
==
"
dir
"
:
match_current_dir_node
(
path
,
subelement
)
match_current_dir_node
(
subelement
,
DirectoryInformationBackend
(
path
))
elif
current_node
[
"
type
"
]
==
"
file
"
:
if
current_node
[
"
representer
"
]
==
"
markdown
"
:
match_markdown_node
(
path
,
subelement
)
print
(
"
MARKDOWN
"
)
match_current_dir_node
(
subelement
,
MarkdownInformationBackend
(
path
))
else
:
raise
RuntimeError
(
"
Not implemented
"
)
def
match_current_dir_node
(
current_dir
,
current_node
):
"""
Do the recursive matching in the file tree.
class
MarkdownInformationBackend
(
InformationBackend
):
def
__init__
(
self
,
filename
=
None
,
header
=
None
):
"""
Parameters
----------
filename : str
The filename of the markdown file. If None, header will be used directly.
header : dict
The header dictionary object.
"""
if
filename
is
None
and
header
is
None
:
raise
ValueError
(
"
filename and header cannot both be None.
"
)
if
filename
is
not
None
:
self
.
header
=
yaml_header_tools
.
get_header_from_file
(
filename
,
clean
=
False
)
else
:
self
.
header
=
header
"""
def
add_defaults
(
self
,
node
:
dict
):
if
"
re
"
not
in
node
:
node
[
"
re
"
]
=
"
.*
"
for
element
in
os
.
listdir
(
current_dir
):
path
=
os
.
path
.
join
(
current_dir
,
element
)
if
current_node
[
"
type
"
]
==
"
dir
"
and
not
os
.
path
.
isdir
(
path
):
continue
elif
current_node
[
"
type
"
]
==
"
file
"
and
os
.
path
.
isdir
(
path
):
continue
match
=
match_file_object
(
current_node
,
element
)
if
match
is
not
None
:
if
"
value
"
not
in
current_node
:
current_node
[
"
value
"
]
=
[]
current_node
[
"
value
"
].
append
(
match
)
if
"
type
"
not
in
node
:
node
[
"
type
"
]
=
"
LIST
"
if
"
children
"
in
current_node
:
match
[
"
children
"
]
=
[]
for
subelement_name
in
current_node
[
"
children
"
]:
subelement
=
get_subnode_with_defaults
(
current_node
[
"
children
"
],
subelement_name
).
copy
()
match
[
"
children
"
].
append
(
subelement
)
if
current_node
[
"
type
"
]
==
"
dir
"
:
match_current_dir_node
(
path
,
subelement
)
elif
current_node
[
"
type
"
]
==
"
file
"
:
if
current_node
[
"
representer
"
]
==
"
markdown
"
:
match_markdown_node
(
path
,
subelement
)
def
get_dict_match
(
node
,
key
,
value
):
"""
Try to match a dict element with key and value with the information supplied in node.
if
"
case
"
not
in
node
:
node
[
"
case
"
]
=
"
sensitive
"
This is absolutely work-in-progress also in the specification, e.g.:
- It is currently not possible to match the name with a regexp.
"""
def
match_file_object
(
self
,
current_node
:
dict
,
current_element
:
str
):
"""
Try to match a filename with the supplied current_node.
This function only uses the current path name specified by filename.
It does not check whether the file system object behind that path is valid
and matching the type of the current_node.
Parameters
----------
current_node : A dictionary containing the matcher.
filename : A filename to match.
Returns
-------
None if the matcher does not match and otherwise a dict with the values of the matcher.
"""
if
current_node
[
"
nodeName
"
]
!=
current_element
:
return
None
if
node
[
"
type
"
]
==
"
TEXT
"
:
flags
=
0
if
node
[
"
case
"
]
==
"
insensitive
"
:
if
current_
node
[
"
case
"
]
==
"
insensitive
"
:
flags
+=
re
.
IGNORECASE
if
"
re
"
in
node
:
regexp
=
node
[
"
re
"
]
else
:
regexp
=
"
.*
"
regexp
=
current_node
[
"
re
"
]
pattern
=
re
.
compile
(
regexp
)
matcher
=
re
.
match
(
pattern
,
)
matcher
=
re
.
match
(
pattern
,
self
.
header
[
current_element
]
)
if
matcher
is
None
:
return
None
# Value of node:
# Value of
current_
node:
# - Add the numeric groups
# - Add the dictionary groups as well
...
...
@@ -217,22 +219,55 @@ def get_dict_match(node, key, value):
valdict
[
i
+
1
]
=
matcher
.
group
(
i
+
1
)
for
k
,
v
in
matcher
.
groupdict
().
items
():
valdict
[
k
]
=
v
else
:
raise
RuntimeError
(
"
Only TEXT is supported at the moment.
"
)
return
valdict
def
match_dict_node
(
current_dict
,
current_node
):
for
key
,
value
in
current_dict
:
return
valdict
def list_elements_function(self):
    """
    Return the elements of the header this backend operates on.

    The header object itself is returned. The former debug output called
    self.header.keys(), which raised AttributeError whenever the header
    was a list instead of a dict; it has been removed.

    Returns
    -------
    self.header
    """
    return self.header
def check_type(self, current_node, current_element):
    """
    Check whether the header entry has the type requested by current_node.

    Parameters
    ----------
    current_node : A dictionary whose "type" entry names the expected type.
                   "LIST" requires a list and "TEXT" requires a str; any
                   other type is not checked.
    current_element : The key of the entry in self.header.

    Returns
    -------
    False if the type is "LIST" or "TEXT" and the header entry is not an
    instance of the corresponding Python type, True otherwise.
    """
    node_type = current_node["type"]
    # The header is only looked up for the types that are actually checked,
    # so unknown types never fail on a missing header entry.
    if node_type == "LIST":
        return isinstance(self.header[current_element], list)
    if node_type == "TEXT":
        return isinstance(self.header[current_element], str)
    return True
def
match_markdown_node
(
current_dir
,
current_node
):
import
yaml_header_tools
def sub_matcher(self, current_node, current_element, subelement):
    """
    Continue the matching for subelement below current_element.

    Only nodes of type "LIST" are descended into: the header entry stored
    under current_element becomes the header of a new
    MarkdownInformationBackend, and subelement is matched against it with
    match_current_dir_node. Nodes of every other type are ignored.

    Parameters
    ----------
    current_node : The node dictionary that matched current_element.
    current_element : The key of the matched entry in self.header.
    subelement : The child node dictionary to be matched.

    Returns
    -------
    None
    """
    if current_node["type"] == "LIST":
        match_current_dir_node(
            subelement,
            MarkdownInformationBackend(header=self.header[current_element]))
header
=
yaml_header_tools
.
get_header_from_file
(
current_dir
)
match_dict_node
(
header
,
current_node
)
def match_current_dir_node(current_node, information_backend):
    """
    Do the recursive matching in the file tree.

    The defaults of the backend are applied to current_node first. Then
    every element listed by the backend that passes check_type is matched
    against current_node. Each successful match is appended to
    current_node["value"]. If current_node has "children", every child
    definition is shallow-copied, labelled with its "nodeName", appended to
    match["children"] and handed to the sub_matcher of the backend.

    Parameters
    ----------
    current_node : The node dictionary to match; it is modified in place.
    information_backend : The backend supplying add_defaults,
                          list_elements_function, check_type,
                          match_file_object and sub_matcher.

    Returns
    -------
    None
    """
    information_backend.add_defaults(current_node)

    for element in information_backend.list_elements_function():
        if not information_backend.check_type(current_node, element):
            continue

        match = information_backend.match_file_object(current_node, element)
        if match is None:
            continue

        current_node.setdefault("value", []).append(match)

        if "children" not in current_node:
            continue

        match["children"] = []
        for subelement_name, template in current_node["children"].items():
            # Work on a copy so that every match gets its own child node
            # and the definition in current_node["children"] stays clean.
            subelement = template.copy()
            subelement["nodeName"] = subelement_name
            match["children"].append(subelement)
            information_backend.sub_matcher(current_node, element, subelement)
def
crawl_cfood
(
dirname
:
str
,
cfood
:
str
):
...
...
@@ -249,12 +284,13 @@ def crawl_cfood(dirname: str,
# Assume root to have a single element (for now):
if
len
(
root_node
)
!=
1
:
raise
ValueError
(
"
Only a single cfood root is allowed.
"
)
current_node
=
get_subnode_with_defaults
(
root_node
,
list
(
root_node
.
keys
())[
0
])
current_dir
=
dirname
match_current_dir_node
(
current_dir
,
current_node
)
root_node_name
=
list
(
root_node
.
keys
())[
0
]
root_node
[
root_node_name
][
"
nodeName
"
]
=
root_node_name
match_current_dir_node
(
root_node
[
root_node_name
],
DirectoryInformationBackend
(
dirname
))
return
curren
t_node
return
roo
t_node
...
...
This diff is collapsed.
Click to expand it.
tests/scifolder_cfood.yml
+
1
−
0
View file @
cd2ff85c
...
...
@@ -25,6 +25,7 @@ root:
description
:
type
:
TEXT
responsible
:
type
:
LIST
children
:
person
:
type
:
TEXT
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment