View Javadoc

1   /*
2    * Copyright 2015 Data Archiving and Networked Services (an institute of
3    * Koninklijke Nederlandse Akademie van Wetenschappen), King's College London,
4    * Georg-August-Universitaet Goettingen Stiftung Oeffentlichen Rechts
5    *
6    * Licensed under the EUPL, Version 1.1 or – as soon they will be approved by
7    * the European Commission - subsequent versions of the EUPL (the "Licence");
8    * You may not use this work except in compliance with the Licence.
9    * You may obtain a copy of the Licence at:
10   *
11   * https://joinup.ec.europa.eu/software/page/eupl
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the Licence is distributed on an "AS IS" basis,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the Licence for the specific language governing
17   * permissions and limitations under the Licence.
18   */
19  
20  package eu.ehri.project.importers.base;
21  
22  import com.google.common.collect.Lists;
23  import com.google.common.collect.Maps;
24  import eu.ehri.project.importers.properties.XmlImportProperties;
25  import eu.ehri.project.importers.util.ImportHelpers;
26  import org.slf4j.Logger;
27  import org.slf4j.LoggerFactory;
28  import org.xml.sax.Attributes;
29  import org.xml.sax.ContentHandler;
30  import org.xml.sax.Locator;
31  import org.xml.sax.SAXException;
32  import org.xml.sax.ext.LexicalHandler;
33  import org.xml.sax.helpers.DefaultHandler;
34  
35  import java.util.List;
36  import java.util.Map;
37  import java.util.Optional;
38  import java.util.Stack;
39  
40  import static eu.ehri.project.definitions.Ontology.LANGUAGE_OF_DESCRIPTION;
41  
42  /**
43   * Reader of XML files, creator of {@link Map}-based representations of EA* files.
44   * Makes use of properties file with format:
45   * <p>
46   * path/within/xml/=node/property
47   * <p>
48   * if no &lt;node&gt; is given, it is the default logical-unit or unit-description of this property file.
49   * with eac.properties this would be an HistoricalAgent with an HistoricalAgentDescription
50   * if there is a &lt;node&gt; given, it will translate to another graph node, like Address.
51   * <p>
52   * lines starting with '{@literal @}' give the attributes:
53   * <code>{@literal @}attribute=tmpname
54   * path/within/xml/@tmpname=node/property</code>
55   * <p>
56   * all tags not included in the properties file that have a  nodevalue will be put in a unknownproperties node,
57   * with an edge to the unit-description.
58   */
59  public abstract class SaxXmlHandler extends DefaultHandler implements LexicalHandler, ContentHandler {
60  
61      private static final Logger logger = LoggerFactory.getLogger(SaxXmlHandler.class);
62  
63      protected Locator locator;
64  
65      /**
66       * Key in the node that denotes the object's identifier.
67       */
68      protected final Stack<Map<String, Object>> currentGraphPath = new Stack<>();
69      protected final Map<String, Map<String, Object>> languageMap = Maps.newHashMap();
70      protected final Stack<String> currentPath = new Stack<>();
71      protected final Stack<StringBuilder> currentText = new Stack<>();
72  
73      protected String currentEntity;
74  
75      protected final ItemImporter<Map<String, Object>, ?> importer;
76      protected final XmlImportProperties properties;
77  
78      protected int depth;
79      private String attribute;
80      private String languagePrefix;
81  
82      public SaxXmlHandler(ItemImporter<Map<String, Object>, ?> importer) {
83          this(importer, null);
84      }
85  
86      public SaxXmlHandler(ItemImporter<Map<String, Object>, ?> importer, XmlImportProperties properties) {
87          super();
88          this.importer = importer;
89          this.properties = properties;
90          currentGraphPath.push(Maps.<String, Object>newHashMap());
91      }
92  
93      /**
94       * Determines whether a new node that is a 'child' of the current node (i.e. a 'sub-node')
95       * needs to be created, based on the qualified element name.
96       *
97       * @param qName the QName
98       * @return true if the QName warrants a sub-node, false otherwise
99       */
100     protected abstract boolean needToCreateSubNode(String qName);
101 
102     @Override
103     public void startEntity(String name) {
104         currentEntity = name;
105     }
106 
107     @Override
108     public void endEntity(String name) {
109         currentEntity = null;
110     }
111 
112     @Override
113     public void startDTD(String name, String publicId, String systemId) {
114     }
115 
116     @Override
117     public void setDocumentLocator(Locator locator) {
118         this.locator = locator;
119     }
120 
121     @Override
122     public void endDTD() {
123     }
124 
125     @Override
126     public void comment(char[] ch, int start, int end) {
127     }
128 
129     @Override
130     public void startCDATA() {
131     }
132 
133     @Override
134     public void endCDATA() {
135     }
136 
137 
138     /**
139      * Receive an opening tag. Initialise the current text to store the characters,
140      * create a language map to hold descriptions in different languages,
141      * push the level if this element warrants a new sub-node and store the attributes
142      * that need to be stored.
143      */
144     @Override
145     public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
146         // initialise the text holding space
147         currentText.push(new StringBuilder());
148 
149         // retrieve the language from the attributes and
150         // create a language map
151         Optional<String> lang = languageAttribute(attributes);
152         if (lang.isPresent()) {
153             languagePrefix = lang.get();
154             if (!languageMap.containsKey(languagePrefix)) {
155                 if (languageMap.isEmpty()) {
156                     currentGraphPath.peek().put(LANGUAGE_OF_DESCRIPTION, languageMap);
157                 }
158                 Map<String, Object> m = Maps.newHashMap();
159                 m.put(LANGUAGE_OF_DESCRIPTION, languagePrefix);
160                 languageMap.put(languagePrefix, m);
161             }
162         }
163 
164         // Update the path with the new element name
165         currentPath.push(withoutNamespace(qName));
166         if (needToCreateSubNode(qName)) { //a new subgraph should be created
167             depth++;
168             logger.debug("Pushing depth... {} -> {}", depth, qName);
169             currentGraphPath.push(Maps.<String, Object>newHashMap());
170         }
171 
172         // Store attributes that are listed in the .properties file
173         for (int attr = 0; attr < attributes.getLength(); attr++) { // only certain attributes get stored
174             String attributeName = withoutNamespace(attributes.getQName(attr));
175             if (properties.hasAttributeProperty(attributeName)
176                     && !properties.getAttributeProperty(attributeName).equals(LANGUAGE_OF_DESCRIPTION)) {
177 
178                 if (isKeyInPropertyFile(currentPath, "@" + properties.getAttributeProperty(attributeName), "")) {
179                     String path = getMappedProperty(currentPath, "@" + properties.getAttributeProperty(attributeName), "");
180                     putPropertyInCurrentGraph(path, attributes.getValue(attr));
181                 } else if (isKeyInPropertyFile(currentPath, "@" + properties.getAttributeProperty(attributeName), "$" + attributes.getValue(attr))) {
182                     attribute = getMappedProperty(currentPath, "@" + properties.getAttributeProperty(attributeName), "$" + attributes.getValue(attr));
183                 } else {
184                     logger.debug("attribute {} not found in properties", attributeName);
185                 }
186             }
187         }
188     }
189 
190     /**
191      * Receive an end element. Put the contents of the text holding space in the current graph
192      * as a property using the .properties file as a mapping.
193      */
194     @Override
195     public void endElement(String uri, String localName, String qName) throws SAXException {
196         if (languagePrefix == null) {
197             if (attribute == null) {
198                 putPropertyInCurrentGraph(getMappedProperty(currentPath), currentText.pop().toString());
199             } else {
200                 putPropertyInCurrentGraph(attribute, currentText.pop().toString());
201                 attribute = null;
202             }
203         } else {
204             ImportHelpers.putPropertyInGraph(languageMap.get(languagePrefix), getMappedProperty(currentPath), currentText.pop().toString());
205         }
206     }
207 
208 
209     /**
210      * Insert a graph representation in the current graph (the one on top of
211      * the currentGraphPath stack) as a list item at the given key.
212      * The value stored at the key is always a list - if the key did not exist
213      * yet, a new list is created to which the sub-graph is added.
214      *
215      * @param key      name of the edge to connect the sub-graph to the current graph
216      * @param subgraph Map graph representation to insert into the current graph
217      */
218     @SuppressWarnings("unchecked")
219     protected void putSubGraphInCurrentGraph(String key, Map<String, Object> subgraph) {
220         Map<String, Object> c = currentGraphPath.peek();
221         if (c.containsKey(key)) {
222             ((List<Map<String, Object>>) c.get(key)).add(subgraph);
223         } else {
224             c.put(key, Lists.newArrayList(subgraph));
225         }
226     }
227 
228     /**
229      * Get the language from the XML attributes, if it is there.
230      * Else return 'absent'.
231      *
232      * @param attributes SAX-parsed XML attributes
233      * @return an Optional containing the language, or 'absent'
234      */
235     private Optional<String> languageAttribute(Attributes attributes) {
236         for (int attr = 0; attr < attributes.getLength(); attr++) { // only certain attributes get stored
237             String isLangAttribute = withoutNamespace(attributes.getQName(attr));
238             String prop = properties.getAttributeProperty(isLangAttribute);
239             if (LANGUAGE_OF_DESCRIPTION.equals(prop)) {
240                 logger.debug("Language detected!");
241                 return Optional.of(attributes.getValue(attr));
242             }
243         }
244         return Optional.empty();
245     }
246 
247     /**
248      * Get the element name without namespace prefix.
249      *
250      * @param qName an element QName that may have a namespace prefix
251      * @return the element name without namespace prefix
252      */
253     private String withoutNamespace(String qName) {
254         return qName.substring(qName.indexOf(":") + 1);
255     }
256 
257     /**
258      * Receives character data in SAX events, converts multiple space to a single space
259      * and puts the characters in the current node, unless the input contains only
260      * whitespace.
261      */
262     @Override
263     public void characters(char ch[], int start, int length) throws SAXException {
264         // NB: 'Blank' (whitespace) only strings are significant here, because
265         // otherwise a sequence of character, line-break, and an entity will
266         // end up being concatenated with the line-break removed. We therefore
267         // preserve all line breaks and other whitespace here and normalize
268         // it when the text gets added to the graph.
269         currentText.peek().append(ch, start, length);
270     }
271 
272     /**
273      * Stores this property value pair in the current DocumentNode.
274      * If the property already exists, it is added to the value list.
275      *
276      * @param property the property name
277      * @param value    the property value
278      */
279     protected void putPropertyInCurrentGraph(String property, String value) {
280         ImportHelpers.putPropertyInGraph(currentGraphPath.peek(), property, value);
281     }
282 
283     /**
284      * Overwrite a value in the current graph.
285      *
286      * @param property name of the property to overwrite
287      * @param value    new value for the property.
288      */
289     protected void overwritePropertyInCurrentGraph(String property, String value) {
290         ImportHelpers.overwritePropertyInGraph(currentGraphPath.peek(), property, value);
291     }
292 
293     /**
294      * Get the property name corresponding to the given path from the .properties file.
295      *
296      * @param path the stacked element names forming a path from the root to the current element
297      * @return the corresponding value to this path from the properties file. the search is inside out, so if
298      * both eadheader/ and ead/eadheader/ are specified, it will return the value for the first
299      * <p>
300      * if this path has no corresponding value in the properties file, it will return the entire path name, with _
301      * replacing the /
302      */
303     protected String getMappedProperty(Stack<String> path) {
304         return getMappedProperty(path, "", "");
305     }
306 
307     /**
308      * did/unitid/{@literal @}ehrilabel$ehri_main_identifier=objectIdentifier
309      *
310      * @param path      did/unitid/
311      * @param attribute {@literal @}ehrilabel
312      * @param value     $ehri_main_identifier
313      * @return the corresponding value to this path from the properties file. The search is inside out, so if
314      * both eadheader/ and ead/eadheader/ are specified, it will return the value for the first.
315      * <p>
316      * If this path has no corresponding value in the properties file, it will return the entire path name, with _
317      * replacing the /
318      */
319     private String getMappedProperty(Stack<String> path, String attribute, String value) {
320         StringBuilder all = new StringBuilder();
321         for (int i = path.size(); i > 0; i--) {
322             all.insert(0, path.get(i - 1) + "/");
323             String key = properties.getProperty(all + attribute + escapeValueForKey(value));
324             if (key != null) {
325                 return key;
326             }
327         }
328         return ImportHelpers.UNKNOWN_PREFIX + all.toString().replace("/", "_");
329     }
330 
331     /**
332      * If this path has no corresponding value in the properties file, it will return false
333      * <p>
334      * did/unitid/{@literal @}ehrilabel$ehri_main_identifier=objectIdentifier
335      *
336      * @param path      did/unitid/
337      * @param attribute {@literal @}ehrilabel
338      * @param value     $ehri_main_identifier
339      * @return returns true if this path is a key in the properties file.
340      */
341     private boolean isKeyInPropertyFile(Stack<String> path, String attribute, String value) {
342         logger.trace("Checking for key in property file: {}, {}, {}", path, attribute, value);
343         String all = "";
344         for (int i = path.size(); i > 0; i--) {
345             all = path.get(i - 1) + "/" + all;
346             String key = all + attribute + escapeValueForKey(value);
347             if (properties.getProperty(key) != null) {
348                 logger.trace(" FOUND Path key: {}", key);
349                 return true;
350             }
351         }
352         return false;
353     }
354 
355     private String escapeValueForKey(String value) {
356         return value.replaceAll("[\\s=:]", "_");
357     }
358 
359     /**
360      * Print a text representation of the graph on `System.out`.
361      */
362     protected void printGraph() {
363         for (String key : currentGraphPath.peek().keySet()) {
364             System.out.println(key + ":" + currentGraphPath.peek().get(key));
365         }
366     }
367 }