View Javadoc

1   /*
2    * Copyright 2015 Data Archiving and Networked Services (an institute of
3    * Koninklijke Nederlandse Akademie van Wetenschappen), King's College London,
4    * Georg-August-Universitaet Goettingen Stiftung Oeffentlichen Rechts
5    *
6    * Licensed under the EUPL, Version 1.1 or – as soon they will be approved by
7    * the European Commission - subsequent versions of the EUPL (the "Licence");
8    * You may not use this work except in compliance with the Licence.
9    * You may obtain a copy of the Licence at:
10   *
11   * https://joinup.ec.europa.eu/software/page/eupl
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the Licence is distributed on an "AS IS" basis,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the Licence for the specific language governing
17   * permissions and limitations under the Licence.
18   */
19  
20  package eu.ehri.project.importers.ead;
21  
22  import com.google.common.collect.Lists;
23  import com.google.common.collect.Maps;
24  import com.tinkerpop.frames.FramedGraph;
25  import eu.ehri.project.definitions.Entities;
26  import eu.ehri.project.definitions.Ontology;
27  import eu.ehri.project.exceptions.ItemNotFound;
28  import eu.ehri.project.exceptions.SerializationError;
29  import eu.ehri.project.exceptions.ValidationError;
30  import eu.ehri.project.importers.ImportLog;
31  import eu.ehri.project.importers.base.AbstractImporter;
32  import eu.ehri.project.importers.links.LinkResolver;
33  import eu.ehri.project.importers.util.ImportHelpers;
34  import eu.ehri.project.models.AccessPointType;
35  import eu.ehri.project.models.DocumentaryUnit;
36  import eu.ehri.project.models.EntityClass;
37  import eu.ehri.project.models.Repository;
38  import eu.ehri.project.models.base.AbstractUnit;
39  import eu.ehri.project.models.base.Accessor;
40  import eu.ehri.project.models.base.Actioner;
41  import eu.ehri.project.models.base.PermissionScope;
42  import eu.ehri.project.persistence.Bundle;
43  import eu.ehri.project.persistence.BundleManager;
44  import eu.ehri.project.persistence.Messages;
45  import eu.ehri.project.persistence.Mutation;
46  import eu.ehri.project.persistence.Serializer;
47  import org.slf4j.Logger;
48  import org.slf4j.LoggerFactory;
49  
50  import java.util.List;
51  import java.util.Map;
52  import java.util.Stack;
53  import java.util.function.BiPredicate;
54  
/**
 * Import EAD for a given repository into the database. Due to the laxness of the EAD
 * standard this is a fairly complex procedure. An EAD file may contain a single entity
 * at the highest level of description, or multiple top-level entities, with or without
 * a hierarchical structure describing their child items. This means that we need to
 * recursively descend through the archdesc and c,c01-12 levels.
 * <p>
 * TODO: Extensive cleanups, optimisation, and rationalisation.
 */
63  public class EadImporter extends AbstractImporter<Map<String, Object>, AbstractUnit> {
64  
65      private static final Logger logger = LoggerFactory.getLogger(EadImporter.class);
66      //the EadImporter can import ead as DocumentaryUnits, the default, or overwrite those and create VirtualUnits instead.
67      private final EntityClass unitEntity = EntityClass.DOCUMENTARY_UNIT;
68      private final Serializer mergeSerializer;
69      private final LinkResolver linkResolver;
70  
71      public static final String ACCESS_POINT = "AccessPoint";
72  
73      /**
74       * Construct an EadImporter object.
75       *
76       * @param graph           the framed graph
77       * @param permissionScope the permission scope
78       * @param log             the log
79       */
80      public EadImporter(FramedGraph<?> graph, PermissionScope permissionScope, Actioner actioner, ImportLog log) {
81          super(graph, permissionScope, actioner, log);
82          mergeSerializer = new Serializer.Builder(graph).dependentOnly().build();
83          linkResolver = new LinkResolver(graph, actioner.as(Accessor.class));
84  
85      }
86  
87      /**
88       * Import a single archdesc or c01-12 item, keeping a reference to the hierarchical depth.
89       *
90       * @param itemData The raw data map
91       * @param idPath   The identifiers of parent documents,
92       *                 not including those of the overall permission scope
93       * @throws ValidationError when the itemData does not contain an identifier for the unit or...
94       */
95      @Override
96      public AbstractUnit importItem(Map<String, Object> itemData, List<String> idPath)
97              throws ValidationError {
98  
99          BundleManager persister = getPersister(idPath);
100 
101         Bundle description = getDescription(itemData);
102 
103         // extractIdentifiers does not throw ValidationError on missing ID
104         Bundle unit = Bundle.of(unitEntity, ImportHelpers.extractIdentifiers(itemData));
105 
106         // Check for missing identifier, throw an exception when there is no ID.
107         if (unit.getDataValue(Ontology.IDENTIFIER_KEY) == null) {
108             throw new ValidationError(unit, Ontology.IDENTIFIER_KEY,
109                     Messages.getString("BundleValidator.missingField"));
110         }
111 
112         Mutation<DocumentaryUnit> mutation =
113                 persister.createOrUpdate(mergeWithPreviousAndSave(unit,
114                         description, idPath), DocumentaryUnit.class);
115         logger.debug("Imported item: {}", itemData.get("name"));
116         DocumentaryUnit frame = mutation.getNode();
117 
118         // Set the repository/item relationship
119         if (idPath.isEmpty() && mutation.created()) {
120             EntityClass scopeType = manager.getEntityClass(permissionScope);
121             if (scopeType.equals(EntityClass.REPOSITORY)) {
122                 Repository repository = framedGraph.frame(permissionScope.asVertex(), Repository.class);
123                 frame.setRepository(repository);
124                 frame.setPermissionScope(repository);
125             } else if (scopeType.equals(unitEntity)) {
126                 DocumentaryUnit parent = framedGraph.frame(permissionScope.asVertex(), DocumentaryUnit.class);
127                 parent.addChild(frame);
128                 frame.setPermissionScope(parent);
129             } else {
130                 logger.error("Unknown scope type for documentary unit: {}", scopeType);
131             }
132         }
133 
134         handleCallbacks(mutation);
135         linkResolver.solveUndeterminedRelationships(frame);
136 
137         return frame;
138     }
139 
140     /**
141      * Extract the documentary unit description bundle from the raw map data.
142      * <p>
143      * Note: the itemData map is mutable and should be considered an out parameter.
144      *
145      * @param itemData the raw data map
146      * @return a description bundle
147      */
148     protected Bundle getDescription(Map<String, Object> itemData) throws ValidationError {
149         List<Map<String, Object>> extractedDates = ImportHelpers.extractDates(itemData);
150 
151         Map<String, Object> raw = ImportHelpers.extractDescription(itemData, EntityClass.DOCUMENTARY_UNIT_DESCRIPTION);
152 
153         Bundle.Builder descBuilder = Bundle.Builder.withClass(EntityClass.DOCUMENTARY_UNIT_DESCRIPTION)
154                 .addData(raw);
155 
156         // Add dates and descriptions to the bundle since they're @Dependent
157         // relations.
158         for (Map<String, Object> dpb : extractedDates) {
159             descBuilder.addRelation(Ontology.ENTITY_HAS_DATE, Bundle.of(EntityClass.DATE_PERIOD, dpb));
160         }
161 
162         for (Map<String, Object> rel : extractRelations(itemData)) {//, (String) unit.getErrors().get(Identifiable.IDENTIFIER_KEY)
163             logger.debug("relation found: {}", rel.get(Ontology.NAME_KEY));
164             descBuilder.addRelation(Ontology.HAS_ACCESS_POINT, Bundle.of(EntityClass.ACCESS_POINT, rel));
165         }
166 
167         for (Map<String, Object> dpb : ImportHelpers.extractSubNodes(Entities.MAINTENANCE_EVENT, itemData)) {
168             logger.debug("maintenance event found {}", dpb);
169             //dates in maintenanceEvents are no DatePeriods, they are not something to search on
170             descBuilder.addRelation(Ontology.HAS_MAINTENANCE_EVENT,
171                     Bundle.of(EntityClass.MAINTENANCE_EVENT, dpb));
172         }
173 
174         Map<String, Object> unknowns = ImportHelpers.extractUnknownProperties(itemData);
175         if (!unknowns.isEmpty()) {
176             StringBuilder unknownProperties = new StringBuilder();
177             for (String u : unknowns.keySet()) {
178                 unknownProperties.append(u);
179             }
180             logger.debug("Unknown Properties found: {}", unknownProperties);
181             descBuilder.addRelation(Ontology.HAS_UNKNOWN_PROPERTY,
182                     Bundle.of(EntityClass.UNKNOWN_PROPERTY, unknowns));
183         }
184 
185         // Set the description identifier same as the source file ID,
186         // which together with the lang code should form a unique
187         // identifier within the item
188         descBuilder.addDataValue(Ontology.IDENTIFIER_KEY, raw.get(Ontology.SOURCEFILE_KEY));
189         return descBuilder.build();
190     }
191 
192     /**
193      * Finds any bundle in the graph with the same ObjectIdentifier.
194      * If there is no bundle with this identifier, it is created.
195      * If it exists and a Description in the given language exists from the same source file,
196      * the description is replaced. If the description is from another source, it is added to the
197      * bundle's descriptions.
198      *
199      * @param unit       the DocumentaryUnit to be saved
200      * @param descBundle the documentsDescription to replace any previous ones with this language
201      * @param idPath     the ID path of this bundle (will be relative to the ID path of the permission scope)
202      * @return A bundle with description relationships merged.
203      */
204     protected Bundle mergeWithPreviousAndSave(Bundle unit, Bundle descBundle, List<String> idPath) throws ValidationError {
205         final String languageOfDesc = descBundle.getDataValue(Ontology.LANGUAGE_OF_DESCRIPTION);
206         final String thisSourceFileId = descBundle.getDataValue(Ontology.SOURCEFILE_KEY);
207 
208         logger.debug("merging: descBundle's language = {}, sourceFileId = {}",
209                 languageOfDesc, thisSourceFileId);
210         /*
211          * for some reason, the idpath from the permissionscope does not contain the parent documentary unit.
212          * TODO: so for now, it is added manually
213          */
214         List<String> itemIdPath = Lists.newArrayList(getPermissionScope().idPath());
215         itemIdPath.addAll(idPath);
216 
217         Bundle unitWithIds = unit.generateIds(itemIdPath);
218         logger.debug("merging: docUnit's graph id = {}", unitWithIds.getId());
219         // If the bundle exists, we merge
220         if (manager.exists(unitWithIds.getId())) {
221             try {
222                 // read the current item’s bundle
223                 Bundle oldBundle = mergeSerializer
224                         .vertexToBundle(manager.getVertex(unitWithIds.getId()));
225 
226                 // filter out dependents that a) are descriptions, b) have the same language/code,
227                 // and c) have the same source file ID
228                 BiPredicate<String, Bundle> filter = (relationLabel, bundle) -> {
229                     String lang = bundle.getDataValue(Ontology.LANGUAGE);
230                     String oldSourceFileId = bundle.getDataValue(Ontology.SOURCEFILE_KEY);
231                     return relationLabel.equals(Ontology.DESCRIPTION_FOR_ENTITY)
232                             && bundle.getType().equals(EntityClass.DOCUMENTARY_UNIT_DESCRIPTION)
233                             && (lang != null && lang.equals(languageOfDesc))
234                             && (oldSourceFileId != null && oldSourceFileId.equals(thisSourceFileId));
235                 };
236                 Bundle filtered = oldBundle.filterRelations(filter);
237 
238                 return unitWithIds.withRelations(filtered.getRelations())
239                         .withRelation(Ontology.DESCRIPTION_FOR_ENTITY, descBundle);
240             } catch (SerializationError ex) {
241                 throw new ValidationError(unit, "serialization error", ex.getMessage());
242             } catch (ItemNotFound ex) {
243                 throw new ValidationError(unit, "item not found exception", ex.getMessage());
244             }
245         } else { // else we create a new bundle.
246             return unit.withRelation(Ontology.DESCRIPTION_FOR_ENTITY, descBundle);
247         }
248     }
249 
250     @SuppressWarnings("unchecked")
251     protected Iterable<Map<String, Object>> extractRelations(Map<String, Object> data) {
252         List<Map<String, Object>> list = Lists.newArrayList();
253         for (String key : data.keySet()) {
254             if (key.equals(Entities.ACCESS_POINT)) {
255                 //name identifier
256                 for (Map<String, Object> origRelation : (List<Map<String, Object>>) data.get(key)) {
257                     Map<String, Object> relationNode = Maps.newHashMap();
258                     if (origRelation.containsKey("type")) {
259                         //try to find the original identifier
260                         relationNode.put(ImportHelpers.LINK_TARGET, origRelation.get("concept"));
261                         //try to find the original name
262                         relationNode.put(Ontology.NAME_KEY, origRelation.get("name"));
263                         relationNode.put("cvoc", origRelation.get("cvoc"));
264                         relationNode.put(Ontology.ACCESS_POINT_TYPE, origRelation.get("type"));
265                     } else {
266                         relationNode.put(Ontology.NAME_KEY, origRelation.get(Entities.ACCESS_POINT));
267                     }
268                     if (!relationNode.containsKey(Ontology.ACCESS_POINT_TYPE)) {
269                         logger.debug("relationNode without type: {}", relationNode.get(Ontology.NAME_KEY));
270                         relationNode.put(Ontology.ACCESS_POINT_TYPE, AccessPointType.corporateBody);
271                     }
272                     list.add(relationNode);
273                 }
274             } else if (key.endsWith(ACCESS_POINT)) {
275 
276                 if (data.get(key) instanceof List) {
277                     //type, targetUrl, targetName, notes
278                     for (Map<String, Object> origRelation : (List<Map<String, Object>>) data.get(key)) {
279                         if (origRelation.isEmpty()) {
280                             break;
281                         }
282                         Map<String, Object> relationNode = Maps.newHashMap();
283                         for (String eventkey : origRelation.keySet()) {
284                             if (eventkey.endsWith(ACCESS_POINT)) {
285                                 relationNode.put(Ontology.ACCESS_POINT_TYPE,
286                                         eventkey.substring(0, eventkey.indexOf(ACCESS_POINT)));
287                                 relationNode.put(Ontology.NAME_KEY, origRelation.get(eventkey));
288                             } else {
289                                 relationNode.put(eventkey, origRelation.get(eventkey));
290                             }
291                         }
292                         if (!relationNode.containsKey(Ontology.ACCESS_POINT_TYPE)) {
293                             relationNode.put(Ontology.ACCESS_POINT_TYPE, AccessPointType.corporateBody);
294                         }
295                         //if no name is given, it was apparently an empty <controlaccess> tag?
296                         if (relationNode.containsKey(Ontology.NAME_KEY)) {
297                             list.add(relationNode);
298                         }
299                     }
300                 } else {
301                     Map<String, Object> relationNode = Maps.newHashMap();
302                     relationNode.put(Ontology.ACCESS_POINT_TYPE,
303                             key.substring(0, key.indexOf(ACCESS_POINT)));
304                     relationNode.put(Ontology.NAME_KEY, data.get(key));
305                     list.add(relationNode);
306                 }
307             }
308         }
309         return list;
310     }
311 
312     @Override
313     public AbstractUnit importItem(Map<String, Object> itemData) throws ValidationError {
314         return importItem(itemData, new Stack<>());
315     }
316 }