Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
532aa60b
Commit
532aa60b
authored
3 years ago
by
Alexander Schlemmer
Browse files
Options
Downloads
Patches
Plain Diff
folder structure for module corrected
parent
e179b2ce
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/crawl.py
+0
-53
0 additions, 53 deletions
src/crawl.py
src/newcrawler/__init__.py
+0
-0
0 additions, 0 deletions
src/newcrawler/__init__.py
src/newcrawler/crawl.py
+157
-0
157 additions, 0 deletions
src/newcrawler/crawl.py
with
157 additions
and
53 deletions
src/crawl.py
deleted
100755 → 0
+
0
−
53
View file @
e179b2ce
#!/usr/bin/env python
# The prototype for a new crawler for CaosDB
# A. Schlemmer, 06/2021
import
argparse
import
os
import
sys
import
yaml
import
re
def crawl_cfood(dirname: str, cfood: str):
    """
    Crawl a single cfood.

    The cfood is loaded from a yaml file and the file tree below dirname is
    walked. The matching itself is still work in progress, so nothing is
    returned or printed yet.

    Parameters
    ----------
    dirname : the root path of the file tree to be crawled
    cfood : the filename of the cfood (yaml) file

    Raises
    ------
    ValueError : If the cfood does not have exactly one root entry.
    """
    # Load the cfood from a yaml file:
    with open(cfood, "r") as f:
        cf = yaml.load(f, Loader=yaml.SafeLoader)

    # current_node was read below without ever being assigned, which raised
    # a NameError on the first directory level.
    # NOTE(review): taking the single entry below "root" is an assumption
    # about the cfood layout -- confirm against the cfood specification.
    root_node = cf["root"]
    if len(root_node) != 1:
        raise ValueError("Only a single cfood root is allowed.")
    current_node = dict(root_node[next(iter(root_node))] or {})
    # dir is the default
    current_node.setdefault("type", "dir")

    for currentpath, dirs, files in os.walk(dirname):
        # for current nodes of type dir look in the list of dirs for matches
        if current_node["type"] == "dir":
            # The loop variable must not be called dirname, otherwise it
            # would rebind the function argument.
            for subdir in dirs:
                pass
        elif current_node["type"] == "file":
            for filename in files:
                pass
        else:
            # work in progress
            pass
def crawl(dirname: str, cfoods: list[str]):
    """
    Crawl a given file hierarchy.

    Parameters
    ----------
    dirname : the root path of the file tree to be crawled
    cfoods : a list of filenames of cfood files
    """
    # simplified for testing: each cfood is crawled on its own, one after
    # the other, over the same file tree
    for cfood_file in cfoods:
        crawl_cfood(dirname, cfood_file)
def main():
    """
    Command line entry point.

    Expects two positional command line arguments: the root path of the
    file tree to be crawled and the filename of a single cfood file.
    Exits with a usage message if one of them is missing.
    """
    # The command line is stored in sys.argv; the sys module has no
    # attribute "args".
    if len(sys.argv) < 3:
        sys.exit("usage: crawl.py DIRNAME CFOOD")
    crawl(sys.argv[1], [sys.argv[2]])


if __name__ == "__main__":
    main()
This diff is collapsed.
Click to expand it.
src/__init__.py
→
src/
newcrawler/
__init__.py
+
0
−
0
View file @
532aa60b
File moved
This diff is collapsed.
Click to expand it.
src/newcrawler/crawl.py
0 → 100755
+
157
−
0
View file @
532aa60b
#!/usr/bin/env python
# The prototype for a new crawler for CaosDB
# A. Schlemmer, 06/2021
import
argparse
import
os
import
sys
import
yaml
import
re
import
json
def match_file_object(node: dict, filename: str):
    """
    Try to match a filename with the supplied node.

    This function only uses the current path name specified by filename.
    It does not check whether the file system object behind that path is
    valid and matching the type of the node.

    Parameters
    ----------
    node : A dictionary containing the matcher. The regular expression is
           taken from node["re"]. If node["case"] is "insensitive" the
           match ignores case; any other value (or a missing "case" entry)
           means a case sensitive match.
    filename : A filename to match.

    Returns
    -------
    A copy of the node with values from the re match object if the node
    matches. The values are stored under the key "value" as a dictionary
    containing the complete match (key 0), the numeric groups (keys 1..n)
    and the named groups (keyed by their names).
    If it does not match this function returns None.

    Raises
    ------
    ValueError : If the node already contains a "value" entry.
    """
    if "value" in node:
        raise ValueError("This node already contains a value.")

    # The flags have to be handed to re.compile, otherwise the
    # "insensitive" setting has no effect on the match.
    flags = re.IGNORECASE if node.get("case") == "insensitive" else 0
    matcher = re.compile(node["re"], flags).match(filename)
    if matcher is None:
        return None

    # Value of node:
    # - The complete match as group 0
    # - Add the numeric groups
    # - Add the dictionary groups as well
    valdict = {0: matcher.group()}
    for index, group in enumerate(matcher.groups(), start=1):
        valdict[index] = group
    valdict.update(matcher.groupdict())

    # Work on a copy so that the node of the cfood stays untouched.
    valnode = node.copy()
    valnode["value"] = valdict
    return valnode
def get_subnode_with_defaults(node: dict, key: str):
    """
    Return the key from node as subnode setting some important defaults for
    the cfood specification.

    Currently this is:
    - Creating an "re" (regular expression) from the key, if no re is set.
    - Add type "dir" if no type is present.
    - Add default case "sensitive" to the node.

    Parameters
    ----------
    node : The dictionary containing the subnode as key.
    key : The key of the dictionary.

    Returns
    -------
    A copy of the subnode including the defaults.

    Raises
    ------
    ValueError : If key is not contained in node.
    """
    if key not in node:
        raise ValueError("Key {} is not in node.".format(key))

    # A key without any content (e.g. "data:" in a yaml file) is loaded as
    # None. Treat it like an empty specification instead of failing on
    # None.copy().
    raw_subnode = node[key]
    subnode = {} if raw_subnode is None else raw_subnode.copy()

    # The key is escaped, so without an explicit "re" it is matched
    # literally.
    subnode.setdefault("re", re.escape(key))
    subnode.setdefault("type", "dir")
    subnode.setdefault("case", "sensitive")

    # also add a node name?
    return subnode
def crawl_cfood(dirname: str, cfood: str):
    """
    Crawl a single cfood.

    The cfood is loaded from a yaml file and its single root entry is used
    as the current node. Every level of the file tree below dirname is then
    searched for names matching that node and each match is printed.

    Parameters
    ----------
    dirname : the root path of the file tree to be crawled
    cfood : the filename of the cfood (yaml) file

    Raises
    ------
    ValueError : If the cfood does not have exactly one root entry.
    """
    # Load the cfood from a yaml file:
    with open(cfood, "r") as f:
        cf = yaml.load(f, Loader=yaml.SafeLoader)

    # Current way of determining the root node:
    root_node = cf["root"]

    # Assume root to have a single element (for now):
    if len(root_node) != 1:
        raise ValueError("Only a single cfood root is allowed.")
    current_node = get_subnode_with_defaults(root_node, next(iter(root_node)))

    # Strategy: keep a list of currently matching candidates...
    for currentpath, dirs, files in os.walk(dirname):
        # for current nodes of type dir look in the list of dirs for matches
        # dir is the default
        if current_node["type"] == "dir":
            # The loop variable must not be called dirname, otherwise it
            # would rebind the function argument.
            for subdir in dirs:
                match = match_file_object(current_node, subdir)
                if match is not None:
                    print(json.dumps(match, indent=2))
        elif current_node["type"] == "file":
            for filename in files:
                # Match against the name of the file itself; the directory
                # name was passed here by mistake before.
                match = match_file_object(current_node, filename)
                if match is not None:
                    print(match)
        else:
            # work in progress
            pass
def crawl(dirname: str, cfoods: list[str]):
    """
    Crawl a given file hierarchy.

    Parameters
    ----------
    dirname : the root path of the file tree to be crawled
    cfoods : a list of filenames of cfood files
    """
    # simplified for testing: each cfood is crawled on its own, one after
    # the other, over the same file tree
    for cfood_file in cfoods:
        crawl_cfood(dirname, cfood_file)
def main():
    """
    Command line entry point.

    Expects two positional command line arguments: the root path of the
    file tree to be crawled and the filename of a single cfood file.
    Exits with a usage message if one of them is missing.
    """
    # The command line is stored in sys.argv; the sys module has no
    # attribute "args".
    if len(sys.argv) < 3:
        sys.exit("usage: crawl.py DIRNAME CFOOD")
    crawl(sys.argv[1], [sys.argv[2]])


if __name__ == "__main__":
    main()
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment