Skip to content
Snippets Groups Projects
Commit 0795545e authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

update crawler

parent b9818e07
No related branches found
No related tags found
No related merge requests found
......@@ -128,7 +128,7 @@ class AbstractCFood(object):
@staticmethod
# move to api?
def set_property(entity, prop, value):
def remove_property(entity, prop):
# TODO only do something when it is necessary?
if isinstance(prop, db.Entity):
......@@ -138,6 +138,11 @@ class AbstractCFood(object):
while entity.get_property(name) is not None:
entity.remove_property(name)
@staticmethod
# move to api?
def set_property(entity, prop, value):
    """
    Sets a property of an entity to a new value.

    ``AbstractCFood.remove_property`` is called for ``prop`` first and
    afterwards ``prop`` is added to ``entity`` with ``value``.
    """
    AbstractCFood.remove_property(entity, prop)
    entity.add_property(prop, value)
@staticmethod
......
......@@ -32,6 +32,7 @@ import caosdb as db
import yaml
from .cfood import AbstractCFood, get_entity
from .utils import string_to_person
def _clean_header(header):
......@@ -140,77 +141,6 @@ it is not at the beginning, it must be preceded by a blank line.
# print(headerlines)
raise NoValidHeader(filename)
return (found_1, found_2, _clean_header(yaml_part))
return _clean_header(yaml_part)
raise NoValidHeader(filename)
def parse_responsibles(header):
    """
    Extract the responsible person(s) from the yaml header.

    If the field ``responsible`` is a list, every entry of that list is added
    as a person. If it is a simple string, it is added as the only person.

    Currently only the format <Firstname> <Lastname> <*> is supported.

    Returns a list containing one Person Record per responsible person.
    Raises a KeyError if the header has no ``responsible`` field and an
    IndexError if an entry consists of fewer than two names.
    """
    responsible = header["responsible"]

    # A single value is wrapped in a list; iterating over a bare string would
    # otherwise yield its characters instead of the person.
    if isinstance(responsible, list):
        resplist = responsible
    else:
        resplist = [responsible]

    people = []

    for person in resplist:
        names = person.split(" ")
        firstname = names[0]
        lastname = names[1]
        p = db.Record()
        p.add_parent("Person")
        p.add_property("lastname", lastname)
        p.add_property("firstname", firstname)
        people.append(p)

    return people
def reference_data_files(entity, header, prefix=""):
    """
    Sets the "Data" property of ``entity`` to the files named in the header.

    The globs are taken from the header via ``get_data_glob``. Globs that do
    not start with "/" are joined with ``prefix`` and normalized. Every glob
    is then looked up with ``find_file_for_fileglob`` and all results that are
    not None are collected and stored as the value of the property.
    """
    referenced_files = []

    for pattern in get_data_glob(header):
        is_relative = not pattern.startswith("/")

        if is_relative:
            pattern = os.path.normpath(os.path.join(prefix, pattern))

        matches = find_file_for_fileglob(pattern)
        referenced_files.extend(
            match for match in matches if match is not None)

    AbstractCFood.set_property(entity, get_entity("Data"), referenced_files)
def get_data_glob(header):
    """
    Collects the file globs listed in the header.

    The field "data" is used if it exists, otherwise the field "results".
    The field may be a list or a single value. Each entry is either the glob
    itself or a dict that holds the glob under the key "filename"; entries
    that are dicts without that key are returned unchanged.

    Returns a list of globs; the list is empty if the field has no value.
    Raises an Exception if neither "data" nor "results" is in the header.
    """
    if "data" in header:
        name = "data"
    elif "results" in header:
        name = "results"
    else:
        raise Exception("no suitable header fields")

    entries = header[name]

    if entries is None:
        # an empty yaml field does not define any glob
        entries = []
    elif not isinstance(entries, list):
        # A single value is wrapped; iterating a bare string or dict would
        # yield its characters or keys instead of the entry.
        entries = [entries]

    globs = []

    for el in entries:
        if isinstance(el, dict) and "filename" in el:
            globs.append(el["filename"])
        else:
            globs.append(el)

    return globs
def find_file_for_fileglob(glob):
    """
    Executes a query for the files that are stored at ``glob``.

    The query string is printed before it is sent and the result of the query
    is returned.
    """
    # TODO this probably needs to be extended for different kinds of glob
    query = "FIND file which is stored at {}".format(glob)
    print(query)

    return db.execute_query(query)
def find_records_referencing_files(glob):
    """
    Executes a query for Records that reference files stored at ``glob``.

    Returns the result of the query.
    """
    # is it possible to reference the "data records"? anything might
    # reference this file
    # NOTE(review): the glob is now appended to the query and the result is
    # returned; before, the query ended after "stored at" and the result was
    # dropped. Confirm the nested query against the CaosDB query language.
    query_template = "FIND Record which references file which is stored at {}"

    return db.execute_query(query_template.format(glob))
......@@ -24,6 +24,8 @@
import os
import caosdb as db
def replace_path_prefix(path, old_prefix, new_prefix):
"""
......@@ -31,8 +33,83 @@ def replace_path_prefix(path, old_prefix, new_prefix):
Raises a RuntimeError when the path does not start with old_prefix.
"""
if not path.startswith(old_prefix):
raise RuntimeError("Path does not start with old_prefix")
path = path[len(old_prefix):]
return os.path.join(new_prefix, path)
def string_to_person(person):
    """
    Creates a Person Record from a string.

    Currently only the format <Firstname> <Lastname> <*> is supported. The
    names may be separated by any amount of whitespace; anything after the
    second name is ignored.

    Returns a Record with the parent "Person" and the properties "lastname"
    and "firstname". Raises an IndexError if the string contains fewer than
    two names.
    """
    # split() without argument ignores leading and repeated whitespace, which
    # would otherwise produce empty names.
    names = person.split()
    firstname = names[0]
    lastname = names[1]

    pr = db.Record()
    pr.add_parent("Person")
    pr.add_property("lastname", lastname)
    pr.add_property("firstname", firstname)

    return pr
def read_field_as_list(field):
    """
    Returns the field as a list.

    Entries of e.g. yaml headers can be single values or lists. To simplify
    the work with those values, a list is returned unchanged while any other
    value is put into a new list with one element.
    """
    return field if isinstance(field, list) else [field]
def find_file_included_by(glob):
    """
    Looks for the files that are included by a glob.

    The glob needs to follow the CaosDB rules. The query string is printed
    before the query is executed and the result of the query is returned.
    """
    query = "FIND file which is stored at {}".format(glob)
    print(query)

    return db.execute_query(query)
def assure_absolute_path_in_glob(glob, prefix):
    """
    Prefixes a relative glob with some path.

    Sometimes files are defined by a relative glob (e.g. "scripts/**"). In
    order to search for such files in CaosDB, these globs are prefixed with
    the current location.

    A relative glob is identified by a missing "/" at the beginning. The
    returned glob is normalized in both cases.
    """
    if glob.startswith("/"):
        candidate = glob
    else:
        candidate = os.path.join(prefix, glob)

    return os.path.normpath(candidate)
def return_field_or_property(value, prop=None):
    """
    Returns the value itself or a property of it.

    Typical in yaml headers is that a field might sometimes contain a single
    value and other times a dict itself. This function either returns the
    single value or (in case of a dict that contains the key ``prop``) the
    value that the dict holds for ``prop``.
    """
    # The membership test has to look into ``value``; the name ``el`` that was
    # used here before is not defined in this function.
    if isinstance(value, dict) and prop in value:
        return value[prop]
    else:
        return value
......@@ -26,7 +26,7 @@ from copy import deepcopy
from tempfile import NamedTemporaryFile
import caosdb as db
from caosadvancedtools.read_md_header import get_data_glob, get_header
from caosadvancedtools.read_md_header import get_header
def to_file(string):
......@@ -44,12 +44,21 @@ class ReadHeaderTest(unittest.TestCase):
def tearDown(self):
    # Intentionally empty: no clean-up is performed after a test.
    pass
def test_hash(self):
def test_list(self):
file_content = """
---
data:
- results/**
- results
- data
...
"""
header = get_header(to_file(file_content))[2]
assert get_data_glob(header)[0] == "results/**"
header = get_header(to_file(file_content))
assert type(header["data"]) is list
file_content = """
---
data:
- data
...
"""
header = get_header(to_file(file_content))
assert type(header["data"]) is list
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment