View Javadoc

1   /*
2    * Copyright 2015 Data Archiving and Networked Services (an institute of
3    * Koninklijke Nederlandse Akademie van Wetenschappen), King's College London,
4    * Georg-August-Universitaet Goettingen Stiftung Oeffentlichen Rechts
5    *
6    * Licensed under the EUPL, Version 1.1 or – as soon they will be approved by
7    * the European Commission - subsequent versions of the EUPL (the "Licence");
8    * You may not use this work except in compliance with the Licence.
9    * You may obtain a copy of the Licence at:
10   *
11   * https://joinup.ec.europa.eu/software/page/eupl
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the Licence is distributed on an "AS IS" basis,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the Licence for the specific language governing
17   * permissions and limitations under the Licence.
18   */
19  
20  package eu.ehri.project.importers.ead;
21  
22  import com.google.common.collect.ImmutableMap;
23  import com.google.common.collect.Lists;
24  import eu.ehri.project.definitions.Entities;
25  import eu.ehri.project.definitions.Ontology;
26  import eu.ehri.project.exceptions.ValidationError;
27  import eu.ehri.project.importers.base.ItemImporter;
28  import eu.ehri.project.importers.base.SaxXmlHandler;
29  import eu.ehri.project.importers.properties.XmlImportProperties;
30  import eu.ehri.project.importers.util.ImportHelpers;
31  import eu.ehri.project.models.DocumentaryUnit;
32  import eu.ehri.project.models.MaintenanceEvent;
33  import eu.ehri.project.models.VirtualUnit;
34  import eu.ehri.project.models.base.AbstractUnit;
35  import eu.ehri.project.models.base.Entity;
36  import org.slf4j.Logger;
37  import org.slf4j.LoggerFactory;
38  import org.xml.sax.Attributes;
39  import org.xml.sax.SAXException;
40  
41  import java.util.ArrayList;
42  import java.util.List;
43  import java.util.Locale;
44  import java.util.Map;
45  import java.util.Stack;
46  import java.util.regex.Pattern;
47  
48  /**
49   * Handler of Virtual EAD files.
50   *
51   * TODO: Clean up and merge with regular EadHandler
52   */
53  public class VirtualEadHandler extends SaxXmlHandler {
54      private static final String AUTHOR = "authors",
55              SOURCEFILEID = "sourceFileId";
56  
57      private final List<Map<String, Object>> globalMaintenanceEvents = Lists.newArrayList();
58  
59      private final ImmutableMap<String, Class<? extends Entity>> possibleSubNodes = ImmutableMap.<String, Class<? extends Entity>>of(
60              Entities.MAINTENANCE_EVENT, MaintenanceEvent.class
61      );
62  
63      private static final Logger logger = LoggerFactory
64              .getLogger(VirtualEadHandler.class);
65  
66      protected final List<AbstractUnit>[] children = new ArrayList[12];
67      private final Stack<String> scopeIds = new Stack<>();
68      // Pattern for EAD nodes that represent a child item
69      private final static Pattern childItemPattern = Pattern.compile("^/*c(?:\\d*)$");
70  
71      // Constants for elements we need to watch for.
72      private final static String ARCHDESC = "archdesc";
73      private final static String DID = "did";
74      /**
75       * used to attach the MaintenanceEvents to
76       */
77      private VirtualUnit topLevel;
78  
79      /**
80       * Default language to use in units without language
81       */
82      protected String eadLanguage = Locale.ENGLISH.getISO3Language();
83  
84      /**
85       * EAD identifier as found in <code>&lt;eadid&gt;</code> in the currently handled EAD file
86       */
87      private String eadId;
88      private String author;
89  
90      /**
91       * Set a custom resolver so EAD DTDs are never looked up online.
92       */
93      @Override
94      public org.xml.sax.InputSource resolveEntity(String publicId, String systemId)
95              throws org.xml.sax.SAXException, java.io.IOException {
96          // This is the equivalent of returning a null dtd.
97          return new org.xml.sax.InputSource(new java.io.StringReader(""));
98      }
99  
100     /**
101      * Create an EadHandler using some importer. The default mapping of paths to node properties is used.
102      *
103      * @param importer
104      */
105     @SuppressWarnings("unchecked")
106     public VirtualEadHandler(ItemImporter<Map<String, Object>, ?> importer) {
107         this(importer, new XmlImportProperties("vc.properties"));
108         logger.warn("vc.properties used");
109     }
110 
111     /**
112      * Create an EadHandler using some importer, and a mapping of paths to node properties.
113      *
114      * @param importer
115      * @param xmlImportProperties
116      */
117     public VirtualEadHandler(ItemImporter<Map<String, Object>, ?> importer,
118             XmlImportProperties xmlImportProperties) {
119         super(importer, xmlImportProperties);
120         children[depth] = Lists.newArrayList();
121     }
122 
123     @Override
124     public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
125         super.startElement(uri, localName, qName, attributes);
126 
127         if (isUnitDelimiter(qName)) { //a new Unit should be created
128             children[depth] = Lists.newArrayList();
129         }
130     }
131 
132     protected List<String> pathIds() {
133         if (scopeIds.isEmpty()) {
134             return scopeIds;
135         } else {
136             List<String> path = Lists.newArrayList();
137             for (int i = 0; i < scopeIds.size() - 1; i++) {
138                 path.add(scopeIds.get(i));
139             }
140             return path;
141         }
142 
143     }
144 
145     private String getCurrentTopIdentifier() {
146         Object current = currentGraphPath.peek().get(ImportHelpers.OBJECT_IDENTIFIER);
147         if (current instanceof List<?>) {
148             return (String) ((List) current).get(0);
149         } else {
150             return (String) current;
151         }
152     }
153 
154     /**
155      * Called when the XML parser encounters an end tag. This is tuned for EAD files, which come in many flavours.
156      * <p>
157      * Certain elements represent subcollections, for which we create new nodes (here, we create representative Maps for nodes).
158      * Many EAD producers use the standard in their own special way, so this method calls generalised methods to filter, get data
159      * in the right place and reformat.
160      * If a collection of EAD files need special treatment to get specific data in the right place, you only need to override the
161      * other methods (in order: extractIdentifier, extractTitle, extractDate).
162      */
163     @Override
164     public void endElement(String uri, String localName, String qName) throws SAXException {
165         //the child closes, add the new DocUnit to the list, establish some relations
166         super.endElement(uri, localName, qName);
167 
168         // If this is the <eadid> element, store its content
169 //    	logger.debug("localName: " + localName + ", qName: " + qName);
170         if (localName.equals("eadid") || qName.equals("eadid")) {
171             eadId = (String) currentGraphPath.peek().get(SOURCEFILEID);
172             logger.debug("Found <eadid>: " + eadId);
173         } else if (localName.equals("author") || qName.equals("author")) {
174             author = (String) currentGraphPath.peek().get(AUTHOR);
175             logger.debug("Found <author>: " + author);
176         }
177 
178         if (localName.equals("language") || qName.equals("language")) {
179             String lang = (String) currentGraphPath.peek().get("languageCode");
180             if (lang != null)
181                 eadLanguage = lang;
182         }
183 
184         // FIXME: We need to add the 'parent' identifier to the ID stack
185         // so that graph path IDs are created correctly. This currently
186         // assumes there's a 'did' element from which we extract this
187         // identifier.
188         if (qName.equals(DID)) {
189             extractIdentifier(currentGraphPath.peek());
190             String topId = getCurrentTopIdentifier();
191             scopeIds.push(topId);
192             logger.debug("Current id path: " + scopeIds);
193         }
194 
195         if (needToCreateSubNode(qName)) {
196             Map<String, Object> currentGraph = currentGraphPath.pop();
197 
198             if (isUnitDelimiter(qName)) {
199                 try {
200                     //add any mandatory fields not yet there:
201                     // First: identifier(s),
202                     extractIdentifier(currentGraph);
203 
204                     // Second: title
205                     extractTitle(currentGraph);
206 
207                     useDefaultLanguage(currentGraph);
208 
209                     extractDate(currentGraph);
210 
211                     currentGraph.put(SOURCEFILEID, getSourceFileId());
212 
213                     //add the <author> of the ead to every description
214                     addAuthor(currentGraph);
215 
216                     if (!globalMaintenanceEvents.isEmpty() && !currentGraph.containsKey(Entities.MAINTENANCE_EVENT)) {
217                         logger.debug("Adding global maintenance events: {}", globalMaintenanceEvents);
218                         currentGraph.put(Entities.MAINTENANCE_EVENT, globalMaintenanceEvents);
219                     }
220 
221                     AbstractUnit current = (AbstractUnit) importer.importItem(currentGraph, pathIds());
222 
223                     if (current.getType().equals(Entities.VIRTUAL_UNIT)) {
224                         logger.debug("virtual unit created: " + current.getIdentifier());
225                         topLevel = (VirtualUnit) current; // if it is not overwritten, the current DU is the topLevel
226                         logger.debug("importer used: " + importer.getClass());
227                         if (depth > 0) { // if not on root level
228                             children[depth - 1].add(current); // add child to parent offspring
229                             // set the parent child relationships by hand
230                             // as we don't have the parent Documentary unit yet.
231                             // only when closing a DocUnit, one can set the relationship to its children,
232                             // but not its parent, as that has not yet been closed.
233                             for (AbstractUnit child : children[depth]) {
234                                 if (child != null) {
235                                     if (child.getType().equals(Entities.VIRTUAL_UNIT)) {
236                                         logger.debug("virtual child");
237 
238                                         ((VirtualUnit) current).addChild(((VirtualUnit) child));
239                                         child.setPermissionScope(current);
240                                     } else { //child.getType().equals(Entities.DOCUMENTARY_UNIT)
241                                         logger.debug("documentary child");
242                                         ((VirtualUnit) current).addIncludedUnit(((DocumentaryUnit) child));
243 
244                                     }
245                                 }
246                             }
247                         }
248                     } else {
249                         //nothing has to happen, since the DocumentaryUnit is already created before
250                         logger.debug("documentary Unit found: " + current.getIdentifier());
251                         if (depth > 0) { // if not on root level
252                             children[depth - 1].add(current); // add child to parent offspring
253                         }
254                     }
255                 } catch (ValidationError ex) {
256                     logger.error("caught validation error: " + ex.getMessage());
257                 } finally {
258                     depth--;
259                     scopeIds.pop();
260                 }
261             } else {
262                 //import the MaintenanceEvent
263                 if (getMappedProperty(currentPath).equals(Entities.MAINTENANCE_EVENT)
264                         && (qName.equals("profiledesc") || qName.equals("change"))) {
265                     Map<String, Object> me = ImportHelpers.getSubNode(currentGraph);
266                     me.put("order", globalMaintenanceEvents.size());
267                     globalMaintenanceEvents.add(me);
268                 }
269 
270                 putSubGraphInCurrentGraph(getMappedProperty(currentPath), currentGraph);
271                 depth--;
272             }
273         }
274 
275         currentPath.pop();
276         if (currentPath.isEmpty()) {
277             currentGraphPath.pop();
278         }
279     }
280 
281     protected String getSourceFileId() {
282         if (getEadId().toLowerCase().endsWith("#" + getDefaultLanguage().toLowerCase())) {
283             return getEadId();
284         }
285         return getEadId() + "#" + getDefaultLanguage().toUpperCase();
286     }
287 
288     /**
289      * Get the EAD identifier of the EAD being imported
290      *
291      * @return the <code>&lt;eadid&gt;</code> or null if it was not parsed yet or empty
292      */
293     protected String getEadId() {
294         if (eadId == null)
295             logger.error("eadid not set yet or empty");
296         return eadId;
297     }
298 
299     protected String getAuthor() {
300         return author;
301     }
302 
303     /**
304      * Checks given currentGraph for a language and sets a default language code
305      * for the description if no language is found.
306      *
307      * @param currentGraph Data at the current node level
308      */
309     protected void useDefaultLanguage(Map<String, Object> currentGraph) {
310         useDefaultLanguage(currentGraph, getDefaultLanguage());
311     }
312 
313     /**
314      * Checks given currentGraph for a language and sets a default language code
315      * for the description if no language is found.
316      *
317      * @param currentGraph    Data at the current node level
318      * @param defaultLanguage Language code to use as default
319      */
320     protected void useDefaultLanguage(Map<String, Object> currentGraph, String defaultLanguage) {
321 
322         if (!currentGraph.containsKey(Ontology.LANGUAGE_OF_DESCRIPTION)) {
323             logger.debug("Using default language code: " + defaultLanguage);
324             currentGraph.put(Ontology.LANGUAGE_OF_DESCRIPTION, defaultLanguage);
325         }
326     }
327 
328     protected String getDefaultLanguage() {
329         return eadLanguage;
330     }
331 
332     /**
333      * Handler-specific code for extraction or generation of unit titles.
334      * Default method is empty; override when necessary.
335      *
336      * @param currentGraph Data at the current node level
337      */
338     protected void extractTitle(Map<String, Object> currentGraph) {
339 
340     }
341 
342     /**
343      * Handler-specific code for extraction or generation of unit dates.
344      * Default method is empty; override when necessary.
345      *
346      * @param currentGraph Data at the current node level
347      */
348     protected void extractDate(Map<String, Object> currentGraph) {
349 
350     }
351 
352     /**
353      * Handler-specific code for extraction or generation of unit IDs.
354      * Default method is empty; override when necessary.
355      *
356      * @param currentGraph Data at the current node level
357      */
358     protected void extractIdentifier(Map<String, Object> currentGraph) {
359 
360     }
361 
362     /**
363      * Helper method to add identifiers to the list of other identifiers.
364      * The property named Ontology.OTHER_IDENTIFIERS (i.e. "otherIdentifiers")
365      * is always an ArrayList of Strings.
366      *
367      * @param currentGraph    the node representation to add the otherIdentifier to
368      * @param otherIdentifier the alternative identifier to add
369      */
370     protected void addOtherIdentifier(Map<String, Object> currentGraph, String otherIdentifier) {
371         if (currentGraph.containsKey(Ontology.OTHER_IDENTIFIERS)) {
372             logger.debug("adding alternative id: " + otherIdentifier);
373             Object oids = currentGraph.get(Ontology.OTHER_IDENTIFIERS);
374             if (oids != null && oids instanceof ArrayList<?>) {
375                 ((ArrayList<String>) oids).add(otherIdentifier);
376                 logger.debug("alternative ID added");
377             }
378         } else {
379             logger.debug("adding first alt id: " + otherIdentifier);
380             List<String> oids = Lists.newArrayList();
381             oids.add(otherIdentifier);
382             currentGraph.put(Ontology.OTHER_IDENTIFIERS, oids);
383         }
384     }
385 
386     @Override
387     protected boolean needToCreateSubNode(String qName) {
388         //child or parent unit:
389         boolean need = isUnitDelimiter(qName);
390         //controlAccess 
391         String path = getMappedProperty(currentPath);
392         if (path != null) {
393             need = need || path.endsWith("AccessPoint");
394         }
395         return need || possibleSubNodes.containsKey(getMappedProperty(currentPath));
396     }
397 
398     /**
399      * Determine if the element represents a unit delimiter
400      *
401      * @param elementName The XML element name
402      * @return Whether or not we're moved to a new item
403      */
404     protected static boolean isUnitDelimiter(String elementName) {
405         return childItemPattern.matcher(elementName).matches() || elementName.equals(ARCHDESC);
406     }
407 
408     private void addAuthor(Map<String, Object> currentGraph) {
409         if (getAuthor() != null && !currentGraph.containsKey(AUTHOR)) {
410             currentGraph.put(AUTHOR, getAuthor());
411         }
412     }
413 }