View Javadoc

/*
 * Copyright 2015 Data Archiving and Networked Services (an institute of
 * Koninklijke Nederlandse Akademie van Wetenschappen), King's College London,
 * Georg-August-Universitaet Goettingen Stiftung Oeffentlichen Rechts
 *
 * Licensed under the EUPL, Version 1.1 or – as soon they will be approved by
 * the European Commission - subsequent versions of the EUPL (the "Licence");
 * You may not use this work except in compliance with the Licence.
 * You may obtain a copy of the Licence at:
 *
 * https://joinup.ec.europa.eu/software/page/eupl
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the Licence is distributed on an "AS IS" basis,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the Licence for the specific language governing
 * permissions and limitations under the Licence.
 */
19  
20  package eu.ehri.project.importers.ead;
21  
22  import com.google.common.base.Joiner;
23  import com.google.common.collect.ImmutableList;
24  import com.google.common.collect.ImmutableMap;
25  import com.google.common.collect.Lists;
26  import eu.ehri.project.definitions.Entities;
27  import eu.ehri.project.definitions.Ontology;
28  import eu.ehri.project.exceptions.ValidationError;
29  import eu.ehri.project.importers.base.ItemImporter;
30  import eu.ehri.project.importers.base.SaxXmlHandler;
31  import eu.ehri.project.importers.properties.XmlImportProperties;
32  import eu.ehri.project.importers.util.ImportHelpers;
33  import eu.ehri.project.models.DocumentaryUnit;
34  import eu.ehri.project.models.MaintenanceEvent;
35  import eu.ehri.project.models.MaintenanceEventType;
36  import eu.ehri.project.models.base.Entity;
37  import eu.ehri.project.persistence.Bundle;
38  import org.slf4j.Logger;
39  import org.slf4j.LoggerFactory;
40  import org.xml.sax.Attributes;
41  import org.xml.sax.SAXException;
42  
43  import java.util.ArrayList;
44  import java.util.List;
45  import java.util.Locale;
46  import java.util.Map;
47  import java.util.Stack;
48  import java.util.regex.Pattern;
49  
50  /**
51   * Handler of EAD files. Use to create a representation of the structure of Documentary Units.
52   * This generic handler does not do tricks to get data from any CHI-custom use of EAD - you
53   * should extend this class for that.
54   * If there is no language, it does set the language of the description to English.
55   * makes use of icaatom.properties with format: part/of/path/=attribute
56   */
57  public class EadHandler extends SaxXmlHandler {
58  
59      // Constants for elements we need to watch for.
60      static final String EADID = "eadid",
61              ARCHDESC = "archdesc",
62              DID = "did";
63  
64      // EAD file-level keys which are added to the data of the top-level
65      // archdesc element. Note: tag->property mappings must exist for these
66      // keys if the data is to be extracted.
67      private static final List<String> eadFileGlobals = ImmutableList.of(
68              "rulesAndConventions", "processInfo"
69      );
70  
71      private static final String DEFAULT_PROPERTIES = "ead2002.properties";
72  
73      private final List<Map<String, Object>> globalMaintenanceEvents = Lists.newArrayList();
74  
75      private final Map<String, Class<? extends Entity>> possibleSubNodes = ImmutableMap.of(
76              Entities.MAINTENANCE_EVENT, MaintenanceEvent.class
77      );
78  
79      private static final Logger logger = LoggerFactory.getLogger(EadHandler.class);
80  
81      @SuppressWarnings("unchecked")
82      protected final List<DocumentaryUnit>[] children = new ArrayList[12];
83  
84      /**
85       * Stack of identifiers of archival units. Push/pop the identifier of the current
86       * node on top/from the top of the stack.
87       */
88      private final Stack<String> scopeIds = new Stack<>();
89  
90      // Pattern for EAD nodes that represent a child item
91      private final static Pattern childItemPattern = Pattern.compile("^/*c(?:\\d*)$");
92  
93      /**
94       * Default language to use in units without language
95       */
96      private String eadLanguage = Locale.ENGLISH.getISO3Language();
97      private String eadId;
98  
99      /**
100      * Set a custom resolver so EAD DTDs are never looked up online.
101      */
102     @Override
103     public org.xml.sax.InputSource resolveEntity(String publicId, String systemId)
104             throws org.xml.sax.SAXException, java.io.IOException {
105         // This is the equivalent of returning a null dtd.
106         return new org.xml.sax.InputSource(new java.io.StringReader(""));
107     }
108 
109     /**
110      * Create an EadHandler using some importer. The default mapping of paths to node properties is used.
111      *
112      * @param importer the importer instance
113      */
114     public EadHandler(ItemImporter<Map<String, Object>, ?> importer) {
115         this(importer, new XmlImportProperties(DEFAULT_PROPERTIES));
116         logger.warn("Using default properties file: {}", DEFAULT_PROPERTIES);
117     }
118 
119     /**
120      * Create an EadHandler using some importer, and a mapping of paths to node properties.
121      *
122      * @param importer   the importer instance
123      * @param properties an XML node properties instance
124      */
125     public EadHandler(ItemImporter<Map<String, Object>, ?> importer,
126             XmlImportProperties properties) {
127         super(importer, properties);
128         children[depth] = Lists.newArrayList();
129     }
130 
131     @Override
132     public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
133         super.startElement(uri, localName, qName, attributes);
134 
135         if (isUnitDelimiter(qName)) { //a new DocumentaryUnit should be created
136             children[depth] = Lists.newArrayList();
137         }
138         if (qName.equals("profiledesc")) {
139             putPropertyInCurrentGraph(Ontology.MAINTENANCE_EVENT_TYPE, MaintenanceEventType.created.toString());
140         }
141         if (qName.equals("change")) {
142             putPropertyInCurrentGraph(Ontology.MAINTENANCE_EVENT_TYPE, MaintenanceEventType.updated.toString());
143         }
144     }
145 
146     /**
147      * Get the full 'path of identifiers' of the current node.
148      *
149      * @return a List of Strings, i.e. identifiers, representing the path of the current node
150      */
151     protected List<String> pathIds() {
152         if (scopeIds.isEmpty()) {
153             return scopeIds;
154         } else {
155             List<String> path = Lists.newArrayList();
156             for (int i = 0; i < scopeIds.size() - 1; i++) {
157                 path.add(scopeIds.get(i));
158             }
159             return path;
160         }
161 
162     }
163 
164     private String getCurrentTopIdentifier() {
165         Object current = currentGraphPath.peek().get(ImportHelpers.OBJECT_IDENTIFIER);
166         if (current instanceof List<?>) {
167             return (String) ((List) current).get(0);
168         } else {
169             return (String) current;
170         }
171     }
172 
173     /**
174      * Called when the XML parser encounters an end tag. This is tuned for EAD files, which come in many flavours.
175      * <p>
176      * Certain elements represent subcollections, for which we create new nodes (here, we create representative Maps for nodes).
177      * Many EAD producers use the standard in their own special way, so this method calls generalised methods to filter, get data
178      * in the right place and reformat.
179      * If a collection of EAD files need special treatment to get specific data in the right place, you only need to override the
180      * other methods (in order: extractIdentifier, extractTitle, extractDate).
181      */
182     @Override
183     public void endElement(String uri, String localName, String qName) throws SAXException {
184         //the child closes, add the new DocUnit to the list, establish some relations
185         super.endElement(uri, localName, qName);
186 
187         // If this is the <eadid> element, store its content
188 
189         if (qName.equals(EADID)) {
190             eadId = ((String) currentGraphPath.peek().get(Ontology.SOURCEFILE_KEY));
191             logger.debug("Found <{}>: {}", EADID, eadId);
192         }
193 
194         if (localName.equals("language") || qName.equals("language")) {
195             String lang = (String) currentGraphPath.peek().get("languageCode");
196             if (lang != null)
197                 eadLanguage = lang;
198         }
199 
200         // FIXME: We need to add the 'parent' identifier to the ID stack
201         // so that graph path IDs are created correctly. This currently
202         // assumes there's a 'did' element from which we extract this
203         // identifier.
204         if (qName.equals(DID)) {
205             extractIdentifier(currentGraphPath.peek());
206             String topId = getCurrentTopIdentifier();
207             scopeIds.push(topId);
208             logger.debug("Current id path: {}", scopeIds);
209         }
210 
211         if (needToCreateSubNode(qName)) {
212             Map<String, Object> currentGraph = currentGraphPath.pop();
213 
214             if (isUnitDelimiter(qName)) {
215                 try {
216                     //add any mandatory fields not yet there:
217                     // First: identifier(s),
218                     extractIdentifier(currentGraph);
219 
220                     // Second: title
221                     extractTitle(currentGraph);
222 
223                     useDefaultLanguage(currentGraph);
224 
225                     extractDate(currentGraph);
226 
227                     //add eadid as sourceFileId
228                     currentGraph.put(Ontology.SOURCEFILE_KEY, getSourceFileId());
229 
230                     //only on toplevel description:
231                     if (qName.equals(ARCHDESC)) {
232                         //add the <author> of the ead to the processInfo
233                         addGlobalValues(currentGraph, currentGraphPath.peek(), eadFileGlobals);
234                     }
235 
236                     if (!globalMaintenanceEvents.isEmpty() && !currentGraph.containsKey(Entities.MAINTENANCE_EVENT)) {
237                         logger.debug("Adding global maintenance events: {}", globalMaintenanceEvents);
238                         currentGraph.put(Entities.MAINTENANCE_EVENT, globalMaintenanceEvents);
239                     }
240 
241                     DocumentaryUnit current = (DocumentaryUnit) importer.importItem(currentGraph, pathIds());
242 
243                     logger.debug("importer used: {}", importer.getClass());
244                     if (depth > 0) { // if not on root level
245                         children[depth - 1].add(current); // add child to parent offspring
246                         // set the parent child relationships by hand
247                         // as we don't have the parent Documentary unit yet.
248                         // only when closing a DocUnit, one can set the relationship to its children,
249                         // but not its parent, as that has not yet been closed.
250                         for (DocumentaryUnit child : children[depth]) {
251                             if (child != null) {
252                                 current.addChild(child);
253                                 child.setPermissionScope(current);
254                             }
255                         }
256                     }
257                 } catch (ValidationError ex) {
258                     Bundle bundle = ex.getBundle();
259                     if (bundle.getId() == null) {
260                         // In order to indicate what has errored here if there's no
261                         // ID we need to create one with the line number reference.
262                         String path = pathIds().isEmpty() ? null : Joiner.on("/").join(pathIds());
263                         String ref = String.format("[Item completed prior to line: %d]",
264                                 locator.getLineNumber());
265                         String id = Joiner.on(" ").skipNulls().join(path, locator.getSystemId(), ref);
266                         importer.handleError(new ValidationError(bundle.withId(id), ex.getErrorSet()));
267                     } else {
268                         importer.handleError(ex);
269                     }
270                 } finally {
271                     depth--;
272                     scopeIds.pop();
273                 }
274             } else {
275                 // import the MaintenanceEvent
276                 if (getMappedProperty(currentPath).equals(Entities.MAINTENANCE_EVENT)
277                         && (qName.equals("profiledesc") || qName.equals("change"))) {
278                     Map<String, Object> me = ImportHelpers.getSubNode(currentGraph);
279                     me.put("order", globalMaintenanceEvents.size());
280                     globalMaintenanceEvents.add(me);
281                 }
282                 putSubGraphInCurrentGraph(getMappedProperty(currentPath), currentGraph);
283                 depth--;
284             }
285         }
286 
287         currentPath.pop();
288         if (currentPath.isEmpty()) {
289             currentGraphPath.pop();
290         }
291 
292     }
293 
294     /**
295      * @return the <code>&lt;eadid&gt;</code>, extended with the languageTag or null if it was not parsed yet or empty
296      */
297     protected String getSourceFileId() {
298         if (eadId == null) {
299             logger.error("EADID not set yet, or not given in eadfile");
300             return null;
301         } else {
302             String suffix = "#" + eadLanguage.toUpperCase();
303             if (eadId.toUpperCase().endsWith(suffix)) {
304                 return eadId;
305             }
306             return eadId + suffix;
307         }
308     }
309 
310     /**
311      * Checks given currentGraph for a language and sets a default language code
312      * for the description if no language is found.
313      *
314      * @param currentGraph Data at the current node level
315      */
316     protected void useDefaultLanguage(Map<String, Object> currentGraph) {
317         useDefaultLanguage(currentGraph, eadLanguage);
318     }
319 
320     /**
321      * Checks given currentGraph for a language and sets a default language code
322      * for the description if no language is found.
323      *
324      * @param currentGraph    Data at the current node level
325      * @param defaultLanguage Language code to use as default
326      */
327     private void useDefaultLanguage(Map<String, Object> currentGraph, String defaultLanguage) {
328         if (!currentGraph.containsKey(Ontology.LANGUAGE_OF_DESCRIPTION)) {
329             logger.debug("Using default language code: {}", defaultLanguage);
330             currentGraph.put(Ontology.LANGUAGE_OF_DESCRIPTION, defaultLanguage);
331         }
332     }
333 
334     /**
335      * if no NAME_KEY is provided, use the IDENTIFIER_KEY
336      *
337      * @param currentGraph Data at the current node level
338      */
339     protected void extractTitle(Map<String, Object> currentGraph) {
340         if (!currentGraph.containsKey(Ontology.NAME_KEY)) {
341             logger.error("no name found, using identifier {}", currentGraph.get(ImportHelpers.OBJECT_IDENTIFIER));
342             currentGraph.put(Ontology.NAME_KEY, currentGraph.get(ImportHelpers.OBJECT_IDENTIFIER));
343         }
344     }
345 
346     /**
347      * Handler-specific code for extraction or generation of unit dates.
348      * Default method is empty; override when necessary.
349      *
350      * @param currentGraph Data at the current node level
351      */
352     @SuppressWarnings("unused")
353     private void extractDate(Map<String, Object> currentGraph) {
354     }
355 
356     /**
357      * Handler-specific code for extraction or generation of unit IDs.
358      * Default method is empty; override when necessary.
359      *
360      * @param currentGraph Data at the current node level
361      */
362     protected void extractIdentifier(Map<String, Object> currentGraph) {
363         // If there are multiple identifiers at this point, take the
364         // first and add the rest as alternate identifiers...
365         if (currentGraph.containsKey(ImportHelpers.OBJECT_IDENTIFIER)) {
366             Object idents = currentGraph.get(ImportHelpers.OBJECT_IDENTIFIER);
367             if (idents instanceof List) {
368                 List identList = (List) idents;
369                 currentGraph.put(ImportHelpers.OBJECT_IDENTIFIER, identList.get(0));
370                 for (Object item : identList.subList(1, identList.size())) {
371                     addOtherIdentifier(currentGraph, ((String) item));
372                 }
373             }
374         }
375     }
376 
377     /**
378      * Helper method to add identifiers to the list of other identifiers.
379      * The property named Ontology.OTHER_IDENTIFIERS (i.e. "otherIdentifiers")
380      * is always an ArrayList of Strings.
381      *
382      * @param currentGraph    the node representation to add the otherIdentifier to
383      * @param otherIdentifier the alternative identifier to add
384      */
385     protected void addOtherIdentifier(Map<String, Object> currentGraph, String otherIdentifier) {
386         if (currentGraph.containsKey(Ontology.OTHER_IDENTIFIERS)) {
387             logger.debug("adding alternative id: {}", otherIdentifier);
388             Object oids = currentGraph.get(Ontology.OTHER_IDENTIFIERS);
389             if (oids instanceof List) {
390                 ((List<String>) oids).add(otherIdentifier);
391             } else {
392                 currentGraph.put(Ontology.OTHER_IDENTIFIERS,
393                         Lists.newArrayList(oids, otherIdentifier));
394             }
395         } else {
396             logger.debug("adding first alt id: {}", otherIdentifier);
397             currentGraph.put(Ontology.OTHER_IDENTIFIERS, Lists.newArrayList(otherIdentifier));
398         }
399     }
400 
401     @Override
402     protected boolean needToCreateSubNode(String qName) {
403         //child or parent unit:
404         boolean need = isUnitDelimiter(qName);
405         //controlAccess 
406         String path = getMappedProperty(currentPath);
407         if (path != null) {
408             need = need || path.endsWith("AccessPoint");
409         }
410         return need || possibleSubNodes.containsKey(getMappedProperty(currentPath));
411     }
412 
413     /**
414      * Determine if the element represents a unit delimiter
415      *
416      * @param elementName The XML element name
417      * @return Whether or not we're moved to a new item
418      */
419     private static boolean isUnitDelimiter(String elementName) {
420         return childItemPattern.matcher(elementName).matches() || elementName.equals(ARCHDESC);
421     }
422 
423     private void addGlobalValues(Map<String, Object> currentGraph, Map<String, Object> globalGraph, List<String> eadFileGlobals) {
424         for (String key : eadFileGlobals) {
425             ImportHelpers.putPropertyInGraph(currentGraph, key, ((String) globalGraph.get(key)));
426         }
427     }
428 }