Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
e013b803
Commit
e013b803
authored
3 years ago
by
Alexander Schlemmer
Browse files
Options
Downloads
Patches
Plain Diff
intermediate state of information backend system
parent
bd5d075d
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
src/newcrawler/crawl.py
+132
-28
132 additions, 28 deletions
src/newcrawler/crawl.py
with
132 additions
and
28 deletions
src/newcrawler/crawl.py
+
132
−
28
View file @
e013b803
...
...
@@ -8,6 +8,7 @@ import sys
import yaml
import re
import json
from abc import ABC, abstractmethod
def
match_file_object
(
node
:
dict
,
filename
:
str
):
...
...
@@ -25,13 +26,9 @@ def match_file_object(node: dict,
Returns
-------
A copy of the node with values from the re match object if the node matches.
If it does not match this function returns None.
None if the matcher does not match and otherwise a dict with the values of the matcher.
"""
if
"
value
"
in
node
:
raise
ValueError
(
"
This node already contains a value.
"
)
flags
=
0
if
node
[
"
case
"
]
==
"
insensitive
"
:
flags
+=
re
.
IGNORECASE
...
...
@@ -43,8 +40,6 @@ def match_file_object(node: dict,
if
matcher
is
None
:
return
None
valnode
=
node
.
copy
()
# Value of node:
# - Add the numeric groups
# - Add the dictionary groups as well
...
...
@@ -55,10 +50,7 @@ def match_file_object(node: dict,
for
k
,
v
in
matcher
.
groupdict
().
items
():
valdict
[
k
]
=
v
valnode
[
"
value
"
]
=
valdict
return
valnode
return
valdict
def
get_subnode_with_defaults
(
node
:
dict
,
key
:
str
):
...
...
@@ -78,13 +70,13 @@ def get_subnode_with_defaults(node: dict,
Returns
-------
A copy of t
he subnode including the defaults.
T
he subnode including the defaults.
"""
if
key
not
in
node
:
raise
ValueError
(
"
Key {} is not in node.
"
.
format
(
key
))
subnode
=
node
[
key
]
.
copy
()
subnode
=
node
[
key
]
if
"
re
"
not
in
subnode
:
subnode
[
"
re
"
]
=
re
.
escape
(
key
)
...
...
@@ -95,7 +87,8 @@ def get_subnode_with_defaults(node: dict,
if
"
case
"
not
in
subnode
:
subnode
[
"
case
"
]
=
"
sensitive
"
# also add a node name?
if
"
nodeName
"
not
in
subnode
:
subnode
[
"
nodeName
"
]
=
key
return
subnode
...
...
@@ -118,6 +111,129 @@ def match_complete(node: dict):
return
all
([
match_complete
(
element
)
for
element
in
node
[
"
children
"
]])
return
True
class InformationBackend(ABC):
    """Abstract interface of a source of elements that nodes are matched against.

    The class derives from ``ABC`` so that the ``@abstractmethod`` markers are
    actually enforced: without an ``ABCMeta`` metaclass the decorator has no
    effect and incomplete backends could be instantiated silently.
    """

    @abstractmethod
    def check_type(self, current_node, current_element):
        """Check ``current_element`` against the type requested by ``current_node``.

        The base implementation does nothing and returns None.
        """
        return

    @abstractmethod
    def list_elements_function(self):
        """List the elements provided by this backend.

        The base implementation does nothing and returns None.
        """
        return

    @abstractmethod
    def sub_matcher(self, current_node, current_element):
        """Continue the matching below ``current_element``.

        The base implementation does nothing and returns None.
        """
        pass
class DirectoryInformationBackend(InformationBackend):
    """Information backend whose elements are the entries of one directory."""

    def __init__(self, current_dir):
        """Remember the directory this backend operates on.

        Parameters
        ----------
        current_dir : str
                      Path of the directory whose entries are the elements.
        """
        self.current_dir = current_dir

    def check_type(self, current_node, current_element):
        """Check that a directory entry has the kind requested by the node.

        Parameters
        ----------
        current_node : dict
                       Node description; its "type" entry is evaluated.
        current_element : str
                          Name of an entry inside ``self.current_dir``.

        Returns
        -------
        False if the node is of type "dir" but the entry is not a directory,
        or if the node is of type "file" but the entry is a directory.
        True in every other case (including node types other than "dir" and
        "file").
        """
        path = os.path.join(self.current_dir, current_element)
        is_dir = os.path.isdir(path)
        node_type = current_node["type"]

        if node_type == "dir":
            return is_dir
        if node_type == "file":
            return not is_dir
        return True

    def list_elements_function(self):
        """Return the names of all entries of ``self.current_dir``."""
        return os.listdir(self.current_dir)

    def sub_matcher(self, current_node, subelement, current_element=None):
        """Continue the matching of ``subelement`` below a matched entry.

        Parameters
        ----------
        current_node : dict
                       Node that was matched; its "type" (and for files its
                       "representer") selects the matching function.
        subelement : dict
                     Child node that is matched next.
        current_element : str, optional
                          Name of the matched entry inside
                          ``self.current_dir``. The path handed to the
                          matching function is built from it.
                          NOTE(review): when omitted, the backend directory
                          itself is used as path - confirm that this is the
                          intended fallback.

        Raises
        ------
        RuntimeError
            If the node is of type "file" and its representer is not
            "markdown".
        """
        if current_element is None:
            path = self.current_dir
        else:
            path = os.path.join(self.current_dir, current_element)

        node_type = current_node["type"]
        if node_type == "dir":
            match_current_dir_node(path, subelement)
        elif node_type == "file":
            # Only markdown files can be looked into at the moment.
            if current_node["representer"] != "markdown":
                raise RuntimeError("Not implemented")
            match_markdown_node(path, subelement)
def match_current_dir_node(current_dir, current_node):
    """
    Match the entries of a directory against a node and descend recursively.

    Every entry of ``current_dir`` whose kind fits the "type" of
    ``current_node`` is passed to ``match_file_object``. Each successful match
    is appended to the list stored under ``current_node["value"]`` (created on
    first use). If the node has "children", the match receives a "children"
    list holding a shallow copy of every child node (with defaults applied),
    and the matching continues for each of these copies: into the directory
    for nodes of type "dir", into the markdown header for nodes of type "file"
    with the "markdown" representer.

    Parameters
    ----------
    current_dir : str
                  Path of the directory whose entries are examined.
    current_node : dict
                   Node description; it is modified in place.

    Returns
    -------
    None
    """
    for entry in os.listdir(current_dir):
        entry_path = os.path.join(current_dir, entry)
        node_type = current_node["type"]

        # Skip entries whose kind does not fit the requested node type.
        if node_type == "dir":
            if not os.path.isdir(entry_path):
                continue
        elif node_type == "file":
            if os.path.isdir(entry_path):
                continue

        found = match_file_object(current_node, entry)
        if found is None:
            continue

        current_node.setdefault("value", []).append(found)

        if "children" not in current_node:
            continue

        child_specs = current_node["children"]
        found["children"] = []
        for child_name in child_specs:
            # Every match gets its own copy of the child node, so that values
            # found below different entries are kept apart.
            child = get_subnode_with_defaults(child_specs, child_name).copy()
            found["children"].append(child)

            if node_type == "dir":
                match_current_dir_node(entry_path, child)
            elif node_type == "file":
                if current_node["representer"] == "markdown":
                    match_markdown_node(entry_path, child)
def get_dict_match(node, key, value):
    """
    Try to match a dict element with key and value with the information supplied in node.

    This is absolutely work-in-progress also in the specification, e.g.:
    - It is currently not possible to match the name with a regexp.

    Parameters
    ----------
    node : dict
           Node description. "type" must be "TEXT". "case" selects case
           sensitive or "insensitive" matching. "re" is the regular expression
           the value is matched against (default: ".*").
    key : str
          Key of the dict element. It is currently not used for the matching.
    value : str
            Value of the dict element that is matched against the regular
            expression.

    Returns
    -------
    None if the regular expression does not match and otherwise a dict with
    the values of the matcher: the complete match under key 0, the numeric
    groups under 1, 2, ... and the named groups under their names.

    Raises
    ------
    RuntimeError
        If the node is of a type different from "TEXT".
    """
    if node["type"] != "TEXT":
        raise RuntimeError("Only TEXT is supported at the moment.")

    # The flags must be handed to the compiled pattern, otherwise the "case"
    # setting of the node has no effect.
    flags = re.IGNORECASE if node["case"] == "insensitive" else 0
    pattern = re.compile(node.get("re", ".*"), flags)

    matcher = pattern.match(value)
    if matcher is None:
        return None

    # Value of node:
    # - Add the numeric groups
    # - Add the dictionary groups as well
    valdict = {0: matcher.group()}
    valdict.update(enumerate(matcher.groups(), start=1))
    valdict.update(matcher.groupdict())
    return valdict
def match_dict_node(current_dict, current_node):
    """
    Match the entries of a dict against a node.

    The matching of the individual entries is not implemented yet. An empty
    dict has nothing to match and is accepted silently; any entry raises.

    Parameters
    ----------
    current_dict : dict
                   The dict whose entries are to be matched.
    current_node : dict
                   Node description the entries are to be matched against.

    Raises
    ------
    NotImplementedError
        As soon as there is an entry that would have to be matched.
    """
    # items() is needed here: iterating over the dict itself yields only the
    # keys, which cannot be unpacked into key and value.
    for key, value in current_dict.items():
        # TODO: match key and value against current_node and store the result.
        raise NotImplementedError(
            "Matching of dict entries is not implemented yet "
            "(first entry: {!r}).".format(key))
def match_markdown_node(current_dir, current_node):
    """
    Match the YAML header of a file against a node.

    The header is read with ``yaml_header_tools.get_header_from_file`` and
    handed to ``match_dict_node`` together with the node.

    Parameters
    ----------
    current_dir : str
                  Path that is passed to ``get_header_from_file``.
    current_node : dict
                   Node description the header is matched against.
    """
    # Imported here, so that the module is only needed when headers are read.
    import yaml_header_tools

    match_dict_node(
        yaml_header_tools.get_header_from_file(current_dir),
        current_node,
    )
def
crawl_cfood
(
dirname
:
str
,
cfood
:
str
):
"""
...
...
@@ -136,21 +252,9 @@ def crawl_cfood(dirname: str,
current_node
=
get_subnode_with_defaults
(
root_node
,
list
(
root_node
.
keys
())[
0
])
current_dir
=
dirname
# Strategy: keep a list of currently matching candidates...
match_current_dir_node
(
current_dir
,
current_node
)
matches
=
[]
for
element
in
os
.
listdir
(
current_dir
):
path
=
os
.
path
.
join
(
dirname
,
element
)
if
current_node
[
"
type
"
]
==
"
dir
"
and
os
.
path
.
isdir
(
path
):
match
=
match_file_object
(
current_node
,
dirname
)
if
match
is
not
None
:
matches
.
append
((
path
,
match
))
elif
current_node
[
"
tpye
"
]
==
"
file
"
and
not
os
.
path
.
isdir
(
path
):
match
=
match_file_object
(
current_node
,
dirname
)
if
match
is
not
None
:
matches
.
append
((
path
,
match
))
return
current_node
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment