CaosDB Crawler: commit bd146bd3
authored 2 years ago by Alexander Schlemmer
MAINT: removed old synchronization function and refactored init method
parent f1636f29
Part of 2 merge requests: !108 (Release 0.5.0) and !104 (Create a new scanner module and move functions from crawl module there)
Showing 1 changed file: src/caoscrawler/crawl.py (+28 additions, -36 deletions)
```diff
@@ -4,8 +4,8 @@
 # ** header v3.0
 # This file is a part of the CaosDB Project.
 #
-# Copyright (C) 2021 Henrik tom Wörden
-#               2021-2023 Alexander Schlemmer (alexander.schlemmer@ds.mpg.de)
+# Copyright (C) 2021 Henrik tom Wörden <h.tomwoerden@indiscale.com>
+#               2021-2023 Alexander Schlemmer <alexander.schlemmer@ds.mpg.de>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as
@@ -67,6 +67,8 @@ from .stores import GeneralStore, RecordStore
 from .structure_elements import StructureElement, Directory, NoneElement
 from .version import check_cfood_version
+from .scanner import scan_directory
 
 logger = logging.getLogger(__name__)
 
 SPECIAL_PROPERTIES_STRICT = ("description", "name", "id", "path")
```
```diff
@@ -175,27 +177,13 @@ class Crawler(object):
     """
 
     def __init__(self,
                  generalStore: Optional[GeneralStore] = None,
-                 debug: bool = False,
-                 identifiableAdapter: IdentifiableAdapter = None,
+                 identifiableAdapter: Optional[IdentifiableAdapter] = None,
                  securityMode: SecurityMode = SecurityMode.UPDATE):
         """
         Create a new crawler and initialize an empty RecordStore and GeneralStore.
 
         Parameters
         ----------
-        recordStore : GeneralStore
-             An initial GeneralStore which might store e.g. environment variables.
-        debug : bool
-             Create a debugging information tree when set to True.
-             The debugging information tree is a variable stored in
-             self.debug_tree. It is a dictionary mapping directory entries
-             to a tuple of general stores and record stores which are valid for
-             the directory scope.
-             Furthermore, it is stored in a second tree named self.debug_copied whether the
-             objects in debug_tree had been copied from a higher level in the hierarchy
-             of the structureelements.
         identifiableAdapter : IdentifiableAdapter
             TODO describe
         securityMode : int
```
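After this change the constructor no longer takes a `debug` flag and both store arguments default internally. A minimal construction sketch, assuming `Crawler` and `SecurityMode` are imported from `caoscrawler.crawl` as defined in this file:

```python
# Minimal sketch, not part of this commit: construct a Crawler with the
# refactored signature. All arguments now have defaults; SecurityMode.RETRIEVE
# is used here only as an example member of the enum.
from caoscrawler.crawl import Crawler, SecurityMode

crawler = Crawler(securityMode=SecurityMode.RETRIEVE)
```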
```diff
@@ -208,30 +196,30 @@ class Crawler(object):
         # different caches.
         self.remote_existing_cache = IdentifiedCache()
         self.remote_missing_cache = IdentifiedCache()
         self.recordStore = RecordStore()
         self.securityMode = securityMode
 
         self.generalStore = generalStore
         if generalStore is None:
             self.generalStore = GeneralStore()
 
         self.identifiableAdapter: IdentifiableAdapter = LocalStorageIdentifiableAdapter()
         if identifiableAdapter is not None:
             self.identifiableAdapter = identifiableAdapter
-        # If a directory is crawled this may hold the path to that directory
-        self.crawled_directory: Optional[str] = None
-        self.debug = debug
 
-    def synchronize(self, commit_changes: bool = True, unique_names=True):
+    def crawl_directory(self,
+                        crawled_directory: str,
+                        crawler_definition_path: str,
+                        restricted_path: Optional[list[str]] = None):
         """
-        Carry out the actual synchronization.
+        The new main function to run the crawler on a directory.
         """
-        # After the crawling, the actual synchronization with the database, based on the
-        # update list is carried out:
-        return self._synchronize(self.crawled_data, commit_changes, unique_names=unique_names)
+        self.crawled_directory = crawled_directory
+        self.run_id = uuid.uuid1()
+
+        # TODO: This is not ideal yet, the data is just returned and needs to be
+        # separately supplied to the synchronize function.
+        return scan_directory(crawled_directory, crawler_definition_path, restricted_path)
 
     def _has_reference_value_without_id(self, ident: Identifiable) -> bool:
         """
```
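The TODO in the new body spells out the intended calling convention: scanning and synchronization become separate steps. A hypothetical end-to-end flow under that reading (paths invented for illustration):

```python
# Hypothetical flow after this commit: crawl_directory() merely delegates to
# scanner.scan_directory() and returns the crawled records, which must then
# be handed to synchronize() separately. The paths below are made up.
from caoscrawler.crawl import Crawler

crawler = Crawler()
records = crawler.crawl_directory("/data/experiments",  # directory to scan
                                  "cfood.yml")          # crawler definition
crawler.synchronize(records)                            # write to the server
```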
```diff
@@ -704,7 +692,8 @@ class Crawler(object):
         return db.Entity(id=id).retrieve()
 
     @staticmethod
-    def execute_inserts_in_list(to_be_inserted, securityMode, run_id: uuid.UUID = None,
+    def execute_inserts_in_list(to_be_inserted, securityMode,
+                                run_id: Optional[uuid.UUID] = None,
                                 unique_names=True):
         for record in to_be_inserted:
             for prop in record.properties:
```
```diff
@@ -732,7 +721,8 @@ class Crawler(object):
                     _resolve_datatype(prop, entity)
 
     @staticmethod
-    def execute_updates_in_list(to_be_updated, securityMode, run_id: uuid.UUID = None,
+    def execute_updates_in_list(to_be_updated, securityMode,
+                                run_id: Optional[uuid.UUID] = None,
                                 unique_names=True):
         Crawler.set_ids_and_datatype_of_parents_and_properties(to_be_updated)
         logger.debug("UPDATE")
```
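Both static methods get the same annotation fix: under PEP 484, a `None` default does not make a parameter `Optional`, so strict type checkers reject the old `run_id: uuid.UUID = None` form. A generic illustration, unrelated to this codebase:

```python
# Generic illustration of the annotation fix; strict mypy flags the first
# signature ("Incompatible default for argument") and accepts the second.
import uuid
from typing import Optional

def before(run_id: uuid.UUID = None):           # implicit Optional: rejected
    ...

def after(run_id: Optional[uuid.UUID] = None):  # explicit Optional: accepted
    ...
```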
```diff
@@ -744,8 +734,10 @@ class Crawler(object):
             update_cache = UpdateCache()
             update_cache.insert(to_be_updated, run_id)
 
-    def _synchronize(self, crawled_data: list[db.Record], commit_changes: bool = True,
-                     unique_names=True):
+    def synchronize(self,
+                    crawled_data: list[db.Record], commit_changes: bool = True,
+                    unique_names=True):
         """
         This function applies several stages:
         1) Retrieve identifiables for all records in crawled_data.
```
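With `_synchronize` promoted to the public `synchronize`, callers pass the crawled records explicitly instead of relying on `self.crawled_data`. A sketch under that assumption; the record below is invented, and actually running this would require a reachable CaosDB server:

```python
# Sketch only: exercise the promoted synchronize() API with an invented record
# built via the caosdb Python client.
import caosdb as db
from caoscrawler.crawl import Crawler

crawler = Crawler()
records = [db.Record(name="sample_001").add_parent(name="Sample")]
crawler.synchronize(records,              # crawled_data, now an explicit argument
                    commit_changes=True,  # actually apply inserts and updates
                    unique_names=True)
```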