diff --git a/src/caoscrawler/converters.py b/src/caoscrawler/converters.py
index 78b7ba2581929411b86f28cb6bdd006cf6f463a6..930a08c167e17efaed39b4b9045589cde68b2e8c 100644
--- a/src/caoscrawler/converters.py
+++ b/src/caoscrawler/converters.py
@@ -94,9 +94,8 @@ def str_to_bool(x):
 # However, we should not narrow down the type of the arguments compared to the function definitions
 # in the parent Converter class. See
 # - https://mypy.readthedocs.io/en/stable/common_issues.html#incompatible-overrides
-# - https://stackoverflow.com/questions/56860/
-# what-is-an-example-of-the-liskov-substitution-principle
-# - https://blog.daftcode.pl/covariance-contravariance-and-invariance-the-ultimate-8fabc0c24278
+# - https://stackoverflow.com/questions/56860/what-is-an-example-of-the-liskov-substitution-principle
+# - https://blog.daftcode.pl/covariance-contravariance-and-invariance-the-ultimate-python-guide-8fabc0c24278
 # Thus, the problem lies in the following design:
 # Converter instances are supposed to be used by the Crawler in a generic way (The crawler calls
 # `match` and `typecheck` etc) but the functions are not supposed to be called with generic
@@ -157,8 +156,8 @@ def replace_variables(propvalue, values: GeneralStore):
 
 
 def handle_value(value: Union[dict, str, list], values: GeneralStore):
-    """Determine whether the given value needs to set a property, be added to an existing value
-    (create a list) or add as an additional property (multiproperty).
+    """Determine whether the given value needs to set a property, be added to an existing value (create a list) or
+    add as an additional property (multiproperty).
 
     Variable names (starting with a "$") are replaced by the corresponding value stored in the
     `values` GeneralStore.
diff --git a/src/caoscrawler/crawl.py b/src/caoscrawler/crawl.py
index b34c2fd3277428b4abdb70514903fe0472853a41..2aeb220cb3279c5bca367305f374218c4ce5c304 100644
--- a/src/caoscrawler/crawl.py
+++ b/src/caoscrawler/crawl.py
@@ -199,8 +199,8 @@ class Crawler(object):
 
         Deprecated arguments:
         - The debug argument does not have an effect anymore.
-        - generalStore: This argument does not have an effect anymore. It might be added to the
-          scanning functions in the scanner module in the future, if needed.
+        - generalStore: This argument does not have an effect anymore. It might be added to the scanning
+          functions in the scanner module in the future, if needed.
 
         Parameters
         ----------
@@ -304,10 +304,7 @@ class Crawler(object):
 
     def _has_reference_value_without_id(self, ident: Identifiable) -> bool:
         """
-        Return True if there is at least one value without ID.
-
-        Specifically, this checks if there is a value in the properties and backrefs attributes of
-        ``ident`` which:
+        Returns True if there is at least one value in the properties attribute of ``ident`` which:
 
         a) is a reference property AND
         b) where the value is set to a
@@ -381,15 +378,11 @@ class Crawler(object):
                 # Entity instead of ID and not cached locally
                 if (isinstance(pvalue, list)):
                     for el in pvalue:
-                        if (isinstance(el, db.Entity) and
-                                self.get_from_remote_missing_cache(
-                                    self.identifiableAdapter.get_identifiable(el, referencing_entities))
-                                is not None):
+                        if (isinstance(el, db.Entity) and self.get_from_remote_missing_cache(
+                                self.identifiableAdapter.get_identifiable(el, referencing_entities)) is not None):
                             return True
-                if (isinstance(pvalue, db.Entity) and
-                        self.get_from_remote_missing_cache(
-                            self.identifiableAdapter.get_identifiable(pvalue, referencing_entities))
-                        is not None):
+                if (isinstance(pvalue, db.Entity) and self.get_from_remote_missing_cache(
+                        self.identifiableAdapter.get_identifiable(pvalue, referencing_entities)) is not None):
                     # might be checked when reference is resolved
                     return True
         return False
@@ -562,11 +555,10 @@ class Crawler(object):
 
         resolved_references = True
         # flat contains Entities which could not yet be checked against the remote server
-        pending: list[db.Entity] = []
         while resolved_references and len(flat) > 0:
             resolved_references = False
             referencing_entities = self.create_reference_mapping(
-                flat + to_be_updated + pending + to_be_inserted)
+                flat + to_be_updated + to_be_inserted)
 
             # For each element we try to find out whether we can find it in the server or whether
             # it does not yet exist. Since a Record may reference other unkown Records it might not
@@ -588,7 +580,7 @@ class Crawler(object):
                     raise RuntimeError("This should not be reached since treated elements"
                                        "are removed from the list")
                 # 1. Can it be identified via an ID?
-                if record.id is not None:
+                elif record.id is not None:
                     to_be_updated.append(record)
                     self.add_to_remote_existing_cache(record, identifiable)
                     del flat[i]
@@ -616,53 +608,11 @@ class Crawler(object):
                     # We merge the two in order to prevent loss of information
                     newrecord = self.get_from_any_cache(identifiable)
                     try:
-                        merge_entities(newrecord, record, merge_references_with_empty_diffs=False)
+                        merge_entities(newrecord, record)
                     except EntityMergeConflictError:
-                        # Deal with two obvious cases where no merge will ever be possible:
-                        # 1. Two Entities with differing IDs
-                        # 2. Two non-Entity values which differ
-                        for this_p in newrecord.properties:
-                            that_p = record.get_property(this_p.name)
-                            if (isinstance(this_p.value, db.Entity)
-                                    and isinstance(that_p.value, db.Entity)):
-                                if this_p.value.id is not None and that_p.value.id is not None:
-                                    if this_p.value.id != that_p.value.id:
-                                        logger.error("The Crawler is trying to merge two entities "
-                                                     "because they should be the same object (same"
-                                                     " identifiables), but they reference "
-                                                     "different Entities with the same Property."
-                                                     f"Problematic Property: {this_p.name}\n"
-                                                     f"Referenced Entities: {this_p.value.id} and "
-                                                     f"{that_p.value.id}\n"
-                                                     f"{record}\n{newrecord}")
-                                        raise RuntimeError("Cannot merge Entities")
-                            elif (not isinstance(this_p.value, db.Entity)
-                                  and not isinstance(that_p.value, db.Entity)):
-                                if ((this_p.value != that_p.value)
-                                        # TODO can we also compare lists?
-                                        and not isinstance(this_p.value, list)
-                                        and not isinstance(that_p.value, list)):
-                                    logger.error("The Crawler is trying to merge two entities "
-                                                 "because they should be the same object (same"
-                                                 " identifiables), but they have "
-                                                 "different values for the same Property."
-                                                 f"Problematic Property: {this_p.name}\n"
-                                                 f"Values: {this_p.value} and "
-                                                 f"{that_p.value}\n"
-                                                 f"{record}\n{newrecord}")
-                                    raise RuntimeError("Cannot merge Entities")
-                        pending.append(record)
-                        if newrecord.id is not None:
-                            record.id = newrecord.id
-                        del flat[i]
-                        # the continue prevents record from ever being resolved even if it would
-                        # be possible
                         continue
                     Crawler.bend_references_to_new_object(
-                        old=record, new=newrecord, entities=(
-                            flat + to_be_updated + to_be_inserted + pending))
-                    referencing_entities = self.create_reference_mapping(
-                        flat + to_be_updated + pending+to_be_inserted)
+                        old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted)
 
                     del flat[i]
                     resolved_references = True
 
@@ -697,14 +647,6 @@ class Crawler(object):
 
         for record in flat:
             self.replace_references_with_cached(record, referencing_entities)
-        for record in pending:
-            identifiable = self.identifiableAdapter.get_identifiable(
-                record,
-                referencing_entities=referencing_entities)
-            newrecord = self.get_from_any_cache(identifiable)
-            merge_entities(newrecord, record)
-            Crawler.bend_references_to_new_object(
-                old=record, new=newrecord, entities=flat + to_be_updated + to_be_inserted)
         if len(flat) > 0:
             circle = self.detect_circular_dependency(flat)
             if circle is None:
@@ -1022,10 +964,8 @@ class Crawler(object):
         if len(ins_problems) > 0 or len(upd_problems) > 0:
             raise ForbiddenTransaction(
                 "One or more Records that have a parent which is excluded from inserts or updates."
-                "\nRecords excluded from inserts have the following RecordTypes:\n"
-                f"{[el.parents[0].name for el in ins_problems]}"
-                "\nRecords excluded from updates have the following RecordTypes:\n"
-                f"{[el.parents[0].name for el in upd_problems]}"
+                f"\nRecords excluded from inserts have the following RecordTypes:\n{[el.parents[0].name for el in ins_problems]}"
+                f"\nRecords excluded from updates have the following RecordTypes:\n{[el.parents[0].name for el in upd_problems]}"
             )
 
         logger.info(f"Going to insert {len(to_be_inserted)} Entities and update "
@@ -1206,8 +1146,7 @@ the CaosDB Crawler successfully crawled the data and
     if get_config_setting("create_crawler_status_records"):
         domain = get_config_setting("public_host_url")
         text += ("You can checkout the CrawlerRun Record for more information:\n"
-                 f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27"
-                 "\n\n")
+                 f"{domain}/Entity/?P=0L10&query=find%20crawlerrun%20with%20run_id=%27{run_id}%27\n\n")
     text += (f"You can download the logfile here:\n{domain}/Shared/" + logfile)
     send_mail(
         from_addr=get_config_setting("sendmail_from_address"),
@@ -1368,8 +1307,7 @@ def crawler_main(crawled_directory_path: str,
     if "SHARED_DIR" in os.environ:
         userlog_public, htmluserlog_public, debuglog_public = configure_server_side_logging()
         _create_status_record(
-            get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public,
-            crawler.run_id)
+            get_config_setting("public_host_url") + "/Shared/" + htmluserlog_public, crawler.run_id)
 
     debug_tree = DebugTree()
     crawled_data = scan_directory(
@@ -1415,8 +1353,8 @@ def crawler_main(crawled_directory_path: str,
 
         if "SHARED_DIR" in os.environ:
             domain = get_config_setting("public_host_url")
-            logger.error("Unexpected Error: Please tell your administrator about this and provide"
-                         f" the following path.\n{domain}/Shared/" + debuglog_public)
+            logger.error("Unexpected Error: Please tell your administrator about this and provide the"
+                         f" following path.\n{domain}/Shared/" + debuglog_public)
         _update_status_record(crawler.run_id, 0, 0, status="FAILED")
         return 1
 
diff --git a/src/caoscrawler/version.py b/src/caoscrawler/version.py
index a92402e2014b23c4db0cc7c39d4db304c803e84f..fdc8323452cd190cc3628efa57c15992f30fabeb 100644
--- a/src/caoscrawler/version.py
+++ b/src/caoscrawler/version.py
@@ -69,8 +69,7 @@ Crawler version installed on your system: {installed_version}
     elif cfood_version < installed_version:
         # only warn if major or minor of installed version are newer than
         # specified in cfood
-        if ((cfood_version.major < installed_version.major)
-                or (cfood_version.minor < installed_version.minor)):
+        if (cfood_version.major < installed_version.major) or (cfood_version.minor < installed_version.minor):
             msg = f"""
 The cfood was written for a previous crawler version. Running the crawler in a
 newer version than specified in the cfood definition may lead to unwanted or
diff --git a/src/doc/conf.py b/src/doc/conf.py
index 1599106425c49c7bf536c49b2435a59d372f9b40..a45097e8849329314b6a9529de7a7bf5e205912c 100644
--- a/src/doc/conf.py
+++ b/src/doc/conf.py
@@ -28,8 +28,8 @@ import sphinx_rtd_theme  # noqa: E402
 
 # -- Project information -----------------------------------------------------
 
-project = 'LinkAhead Crawler'
-copyright = '2023, IndiScale'
+project = 'caosdb-caoscrawler'
+copyright = '2021, MPIDS'
 author = 'Alexander Schlemmer'
 
 # The short X.Y version
diff --git a/src/doc/index.rst b/src/doc/index.rst
index 6bd30e6827d38f613f9c1836ee4fa1b28d09c1de..20f335f7885971b65caf91dfe723f867e46b8595 100644
--- a/src/doc/index.rst
+++ b/src/doc/index.rst
@@ -7,22 +7,22 @@ CaosDB-Crawler Documentation
    :caption: Contents:
    :hidden:
 
-   Getting started <getting_started/index>
-   Tutorials <tutorials/index>
-   Concepts <concepts>
-   Converters <converters>
+   Getting started<getting_started/index>
+   Tutorials<tutorials/index>
+   Concepts<concepts>
+   Converters<converters>
    CFoods (Crawler Definitions)<cfood>
-   Macros <macros>
-   How to upgrade <how-to-upgrade>
-   API documentation <_apidoc/modules>
-   Back to Overview <https://docs.indiscale.com/>
+   Macros<macros>
+   How to upgrade<how-to-upgrade>
+   API documentation<_apidoc/modules>
+
 
 
-This is the documentation for LinkAhead-Crawler (previously known as crawler 2.0)
-the main tool for automatic data insertion into LinkAhead.
+This is the documentation for CaosDB-Crawler (previously known as crawler 2.0)
+the main tool for automatic data insertion into CaosDB.
 
 Its task is to automatically synchronize data found on file systems or in other
-sources of data with the semantic data model of LinkAhead.
+sources of data with the semantic data model of CaosDB.
 
 More specifically, data that is contained in a hierarchical structure is converted to a data
 structure that is consistent with a predefined semantic data model.
@@ -31,12 +31,9 @@ The hierarchical structure can be for example a file tree. However it can be
 also something different like the contents of a JSON file or a file tree with
 JSON files.
 
-This documentation helps you to :doc:`get started<getting_started/index>`, explains the most
-important :doc:`concepts<concepts>` and offers a range of :doc:`tutorials<tutorials/index>`.
+This documentation helps you to :doc:`get started<getting_started/index>`, explains the most important
+:doc:`concepts<concepts>` and offers a range of :doc:`tutorials<tutorials/index>`.
 
-Or go back to the general `overview`_ of the documentation.
-
-.. _overview: https://docs.indiscale.com/
 
 Indices and tables
 ==================
diff --git a/unittests/test_crawler.py b/unittests/test_crawler.py
index 62184d8a61f80ba85369ea8e50d4a437d88556e8..91e0e86a6d6cf2967ab3567a2ef93b7ccde56e64 100644
--- a/unittests/test_crawler.py
+++ b/unittests/test_crawler.py
@@ -362,8 +362,7 @@ def test_has_missing_object_in_references():
         basic_retrieve_by_name_mock_up, known={"C": db.Record(name="C").add_parent("RTC")
                                                .add_property("d").add_property("name"),
                                                "D": db.Record(name="D").add_parent("RTD")
-                                               .add_property("d").add_property("e")
-                                               .add_property("name"),
+                                               .add_property("d").add_property("e").add_property("name"),
                                                }))
 
     # one reference with id -> check