Skip to content
GitLab
Explore
Sign in
Register
Primary navigation
Search or go to…
Project
CaosDB Crawler
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Iterations
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
caosdb
Software
CaosDB Crawler
Commits
3052a97b
Commit
3052a97b
authored
3 years ago
by
Alexander Schlemmer
Browse files
Options
Downloads
Patches
Plain Diff
FIX: fixed some bugs in markdown conversion
parent
521d8d05
No related branches found
No related tags found
1 merge request
!53
Release 0.1
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
src/newcrawler/converters.py
+27
-18
27 additions, 18 deletions
src/newcrawler/converters.py
tests/scifolder_cfood.yml
+15
-2
15 additions, 2 deletions
tests/scifolder_cfood.yml
tests/test_tool.py
+45
-7
45 additions, 7 deletions
tests/test_tool.py
with
87 additions
and
27 deletions
src/newcrawler/converters.py
+
27
−
18
View file @
3052a97b
...
@@ -29,7 +29,7 @@ import caosdb as db
...
@@ -29,7 +29,7 @@ import caosdb as db
from
.stores
import
GeneralStore
,
RecordStore
from
.stores
import
GeneralStore
,
RecordStore
from
.structure_elements
import
(
StructureElement
,
Directory
,
File
,
from
.structure_elements
import
(
StructureElement
,
Directory
,
File
,
TextElement
,
DictTextElement
,
DictListElement
)
TextElement
,
DictTextElement
,
DictListElement
)
from
typing
import
Type
,
Union
,
Liter
al
from
typing
import
Type
,
Option
al
from
abc
import
abstractmethod
from
abc
import
abstractmethod
import
yaml_header_tools
import
yaml_header_tools
...
@@ -160,11 +160,7 @@ class Converter(object):
...
@@ -160,11 +160,7 @@ class Converter(object):
m
=
self
.
match
(
element
)
m
=
self
.
match
(
element
)
if
m
is
None
:
if
m
is
None
:
raise
RuntimeError
(
"
Condition does not match.
"
)
raise
RuntimeError
(
"
Condition does not match.
"
)
if
type
(
m
)
==
bool
:
values
.
update
(
m
)
if
m
==
False
:
raise
RuntimeError
(
"
Result of match must not be False, use None instead.
"
)
return
values
.
update
(
m
.
groupdict
())
@abstractmethod
@abstractmethod
def
create_children
(
self
,
values
:
GeneralStore
,
def
create_children
(
self
,
values
:
GeneralStore
,
...
@@ -205,7 +201,7 @@ class Converter(object):
...
@@ -205,7 +201,7 @@ class Converter(object):
pass
pass
@abstractmethod
@abstractmethod
def
match
(
self
,
element
:
StructureElement
)
->
Union
[
re
.
Match
,
Literal
[
True
],
None
]:
def
match
(
self
,
element
:
StructureElement
)
->
Optional
[
dict
]:
pass
pass
class
DirectoryConverter
(
Converter
):
class
DirectoryConverter
(
Converter
):
...
@@ -229,7 +225,10 @@ class DirectoryConverter(Converter):
...
@@ -229,7 +225,10 @@ class DirectoryConverter(Converter):
def
match
(
self
,
element
:
StructureElement
):
def
match
(
self
,
element
:
StructureElement
):
if
not
isinstance
(
element
,
Directory
):
if
not
isinstance
(
element
,
Directory
):
raise
RuntimeError
(
"
Element must be a directory.
"
)
raise
RuntimeError
(
"
Element must be a directory.
"
)
return
re
.
match
(
self
.
definition
[
"
match
"
],
element
.
name
)
m
=
re
.
match
(
self
.
definition
[
"
match
"
],
element
.
name
)
if
m
is
None
:
return
None
return
m
.
groupdict
()
@staticmethod
@staticmethod
def
create_children_from_directory
(
element
:
Directory
):
def
create_children_from_directory
(
element
:
Directory
):
...
@@ -263,7 +262,7 @@ class MarkdownFileConverter(Converter):
...
@@ -263,7 +262,7 @@ class MarkdownFileConverter(Converter):
if
not
isinstance
(
element
,
File
):
if
not
isinstance
(
element
,
File
):
raise
RuntimeError
(
"
A markdown file is needed to create children.
"
)
raise
RuntimeError
(
"
A markdown file is needed to create children.
"
)
header
=
yaml_header_tools
.
get_header_from_file
(
element
.
path
)
header
=
yaml_header_tools
.
get_header_from_file
(
element
.
path
,
clean
=
False
)
children
:
list
[
StructureElement
]
=
[]
children
:
list
[
StructureElement
]
=
[]
for
name
,
entry
in
header
.
items
():
for
name
,
entry
in
header
.
items
():
...
@@ -283,11 +282,13 @@ class MarkdownFileConverter(Converter):
...
@@ -283,11 +282,13 @@ class MarkdownFileConverter(Converter):
if
not
isinstance
(
element
,
File
):
if
not
isinstance
(
element
,
File
):
raise
RuntimeError
(
"
Element must be a file.
"
)
raise
RuntimeError
(
"
Element must be a file.
"
)
m
=
re
.
match
(
self
.
definition
[
"
match
"
],
element
.
name
)
m
=
re
.
match
(
self
.
definition
[
"
match
"
],
element
.
name
)
if
m
is
None
:
return
None
try
:
try
:
yaml_header_tools
.
get_header_from_file
(
element
.
path
)
yaml_header_tools
.
get_header_from_file
(
element
.
path
)
except
yaml_header_tools
.
NoValidHeader
:
except
yaml_header_tools
.
NoValidHeader
:
return
None
return
None
return
m
return
m
.
groupdict
()
class
DictTextElementConverter
(
Converter
):
class
DictTextElementConverter
(
Converter
):
def
create_children
(
self
,
generalStore
:
GeneralStore
,
def
create_children
(
self
,
generalStore
:
GeneralStore
,
...
@@ -298,13 +299,19 @@ class DictTextElementConverter(Converter):
...
@@ -298,13 +299,19 @@ class DictTextElementConverter(Converter):
def
typecheck
(
self
,
element
:
StructureElement
):
def
typecheck
(
self
,
element
:
StructureElement
):
return
isinstance
(
element
,
DictTextElement
)
return
isinstance
(
element
,
DictTextElement
)
def
match
(
self
,
element
:
StructureElement
)
->
Union
[
re
.
Match
,
Literal
[
True
],
None
]
:
def
match
(
self
,
element
:
StructureElement
):
if
not
isinstance
(
element
,
DictTextElement
):
if
not
isinstance
(
element
,
DictTextElement
):
raise
RuntimeError
(
"
Element must be a DictTextElement.
"
)
raise
RuntimeError
(
"
Element must be a DictTextElement.
"
)
if
self
.
name
!=
element
.
name
:
m1
=
re
.
match
(
self
.
definition
[
"
match_name
"
],
self
.
name
)
if
m1
is
None
:
return
None
return
None
m
=
re
.
match
(
self
.
definition
[
"
match
"
],
element
.
value
)
m2
=
re
.
match
(
self
.
definition
[
"
match_value
"
],
element
.
value
)
return
m
if
m2
is
None
:
return
None
values
=
dict
()
values
.
update
(
m1
.
groupdict
())
values
.
update
(
m2
.
groupdict
())
return
values
class
DictListElementConverter
(
Converter
):
class
DictListElementConverter
(
Converter
):
def
create_children
(
self
,
generalStore
:
GeneralStore
,
def
create_children
(
self
,
generalStore
:
GeneralStore
,
...
@@ -316,14 +323,14 @@ class DictListElementConverter(Converter):
...
@@ -316,14 +323,14 @@ class DictListElementConverter(Converter):
def
typecheck
(
self
,
element
:
StructureElement
):
def
typecheck
(
self
,
element
:
StructureElement
):
return
isinstance
(
element
,
DictListElement
)
return
isinstance
(
element
,
DictListElement
)
def
match
(
self
,
element
:
StructureElement
)
->
Union
[
re
.
Match
,
Literal
[
True
],
None
]
:
def
match
(
self
,
element
:
StructureElement
):
if
not
isinstance
(
element
,
DictListElement
):
if
not
isinstance
(
element
,
DictListElement
):
raise
RuntimeError
(
"
Element must be a DictListElement.
"
)
raise
RuntimeError
(
"
Element must be a DictListElement.
"
)
if
self
.
name
!=
element
.
name
:
if
self
.
name
!=
element
.
name
:
return
None
return
None
if
"
match
"
in
self
.
definition
:
if
"
match
"
in
self
.
definition
:
raise
NotImplementedError
(
"
Match is not implemented for DictListElement.
"
)
raise
NotImplementedError
(
"
Match is not implemented for DictListElement.
"
)
return
True
return
dict
()
class
TextElementConverter
(
Converter
):
class
TextElementConverter
(
Converter
):
def
create_children
(
self
,
generalStore
:
GeneralStore
,
def
create_children
(
self
,
generalStore
:
GeneralStore
,
...
@@ -333,8 +340,10 @@ class TextElementConverter(Converter):
...
@@ -333,8 +340,10 @@ class TextElementConverter(Converter):
def
typecheck
(
self
,
element
:
StructureElement
):
def
typecheck
(
self
,
element
:
StructureElement
):
return
isinstance
(
element
,
TextElement
)
return
isinstance
(
element
,
TextElement
)
def
match
(
self
,
element
:
StructureElement
)
->
Union
[
re
.
Match
,
Literal
[
True
],
None
]
:
def
match
(
self
,
element
:
StructureElement
):
if
not
isinstance
(
element
,
TextElement
):
if
not
isinstance
(
element
,
TextElement
):
raise
RuntimeError
(
"
Element must be a TextElement.
"
)
raise
RuntimeError
(
"
Element must be a TextElement.
"
)
m
=
re
.
match
(
self
.
definition
[
"
match
"
],
element
.
value
)
m
=
re
.
match
(
self
.
definition
[
"
match
"
],
element
.
value
)
return
m
if
m
is
None
:
return
None
return
m
.
groupdict
()
This diff is collapsed.
Click to expand it.
tests/scifolder_cfood.yml
+
15
−
2
View file @
3052a97b
...
@@ -29,12 +29,25 @@ DataAnalysis: # name of the converter
...
@@ -29,12 +29,25 @@ DataAnalysis: # name of the converter
subtree
:
subtree
:
description
:
description
:
type
:
DictTextElement
type
:
DictTextElement
match
:
(?P<description>.*)
match_value
:
(?P<description>.*)
match_name
:
description
records
:
records
:
Measurement
:
Measurement
:
description
:
$description
description
:
$description
responsible
:
responsible_single
:
type
:
DictTextElement
match_name
:
responsible
match_value
:
(?P<first_name>.+) (?P<last_name>.+)
records
:
Person
:
first_name
:
$first_name
last_name
:
$last_name
Measurement
:
# this uses the reference to the above defined record
responsible
:
+$Person
responsible_list
:
type
:
DictListElement
type
:
DictListElement
match_name
:
responsible
subtree
:
subtree
:
Person
:
Person
:
type
:
TextElement
type
:
TextElement
...
...
This diff is collapsed.
Click to expand it.
tests/test_tool.py
+
45
−
7
View file @
3052a97b
...
@@ -5,7 +5,7 @@
...
@@ -5,7 +5,7 @@
from
newcrawler
import
Crawler
from
newcrawler
import
Crawler
from
newcrawler.converters
import
MarkdownFileConverter
from
newcrawler.converters
import
MarkdownFileConverter
from
newcrawler.structure_elements
import
File
from
newcrawler.structure_elements
import
File
,
DictTextElement
,
DictListElement
from
os.path
import
join
,
dirname
,
basename
from
os.path
import
join
,
dirname
,
basename
import
caosdb
as
db
import
caosdb
as
db
...
@@ -72,16 +72,54 @@ def test_crawler():
...
@@ -72,16 +72,54 @@ def test_crawler():
assert
subd
[
1
][
"
Measurement
"
].
get_property
(
"
project
"
).
value
==
subd
[
0
][
"
Project
"
]
assert
subd
[
1
][
"
Measurement
"
].
get_property
(
"
project
"
).
value
==
subd
[
0
][
"
Project
"
]
def
test_markdown_converter
():
def
test_markdown_converter
():
test_readme
=
File
(
"
README.md
"
,
rfp
(
"
test_directories
"
,
"
examples_article
"
,
"
DataAnalysis
"
,
"
2020_climate-model-predict
"
,
"
2020-02-08_prediction-errors
"
,
"
README.md
"
))
converter
=
MarkdownFileConverter
({
converter
=
MarkdownFileConverter
({
"
match
"
:
"
(.*)
"
"
match
"
:
"
(.*)
"
},
"
TestMarkdownFileConverter
"
)
},
"
TestMarkdownFileConverter
"
)
m
=
converter
.
match
(
File
(
"
README.md
"
,
rfp
(
"
test_directories
"
,
"
examples_article
"
,
"
DataAnalysis
"
,
"
2020_climate-model-predict
"
,
"
2020-02-08_prediction-errors
"
,
"
README.md
"
)))
assert
m
is
not
None
assert
len
(
m
.
groups
())
==
1
m
=
converter
.
match
(
File
(
"
test_tool.py
"
,
rfp
(
m
=
converter
.
match
(
File
(
"
test_tool.py
"
,
rfp
(
"
test_tool.py
"
)))
"
test_tool.py
"
)))
assert
m
is
None
assert
m
is
None
m
=
converter
.
match
(
test_readme
)
assert
m
is
not
None
assert
m
.
__class__
==
dict
assert
len
(
m
)
==
0
converter
=
MarkdownFileConverter
({
"
match
"
:
"
README.md
"
},
"
TestMarkdownFileConverter
"
)
m
=
converter
.
match
(
test_readme
)
assert
m
is
not
None
assert
len
(
m
)
==
0
children
=
converter
.
create_children
(
None
,
test_readme
)
assert
len
(
children
)
==
5
assert
children
[
1
].
__class__
==
DictTextElement
assert
children
[
1
].
name
==
"
description
"
assert
children
[
1
].
value
.
__class__
==
str
assert
children
[
0
].
__class__
==
DictTextElement
assert
children
[
0
].
name
==
"
responsible
"
assert
children
[
0
].
value
.
__class__
==
str
test_readme2
=
File
(
"
README.md
"
,
rfp
(
"
test_directories
"
,
"
examples_article
"
,
"
ExperimentalData
"
,
"
2020_SpeedOfLight
"
,
"
2020-01-01_TimeOfFlight
"
,
"
README.md
"
))
m
=
converter
.
match
(
test_readme2
)
assert
m
is
not
None
assert
len
(
m
)
==
0
children
=
converter
.
create_children
(
None
,
test_readme2
)
assert
len
(
children
)
==
2
assert
children
[
1
].
__class__
==
DictTextElement
assert
children
[
1
].
name
==
"
description
"
assert
children
[
1
].
value
.
__class__
==
str
assert
children
[
0
].
__class__
==
DictListElement
assert
children
[
0
].
name
==
"
responsible
"
assert
children
[
0
].
value
.
__class__
==
list
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment