View Javadoc

1   package eu.ehri.project.importers.util;
2   
3   import com.google.common.collect.Lists;
4   import com.google.common.collect.Maps;
5   import eu.ehri.project.definitions.Entities;
6   import eu.ehri.project.definitions.Ontology;
7   import eu.ehri.project.importers.properties.XmlImportProperties;
8   import org.joda.time.DateTime;
9   import org.joda.time.format.DateTimeFormatter;
10  import org.joda.time.format.ISODateTimeFormat;
11  
12  import java.text.ParsePosition;
13  import java.text.SimpleDateFormat;
14  import java.util.List;
15  import java.util.Locale;
16  import java.util.Map;
17  import java.util.Optional;
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  
21  import static eu.ehri.project.importers.util.ImportHelpers.getSubNode;
22  
23  class DateParser {
24  
25      // Various date patterns
26      private static final Pattern[] datePatterns = {
27              // Yad Vashem, ICA-Atom style: 1924-1-1 - 1947-12-31
28              // Yad Vashem in Wp2: 12-15-1941, 9-30-1944
29              Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})\\s?-\\s?(\\d{4}-\\d{1,2}-\\d{1,2})$"),
30              Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})$"),
31              Pattern.compile("^(\\d{4})\\s?-\\s?(\\d{4})$"),
32              Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"),
33              Pattern.compile("^(\\d{4})-\\[(\\d{4})\\]$"),
34              Pattern.compile("^(\\d{4}s)-\\[(\\d{4}s)\\]$"),
35              Pattern.compile("^\\[(\\d{4})\\]$"),
36              Pattern.compile("^(\\d{4})$"),
37              Pattern.compile("^(\\d{2})th century$"),
38              Pattern.compile("^\\s*(\\d{4})\\s*-\\s*(\\d{4})"),
39              //bundesarchive: 1906/19
40              Pattern.compile("^\\s*(\\d{4})/(\\d{2})"),
41              Pattern.compile("^\\s*(\\d{4})\\s*/\\s*(\\d{4})"),
42              Pattern.compile("^(\\d{4}-\\d{1,2})/(\\d{4}-\\d{1,2})"),
43              Pattern.compile("^(\\d{4}-\\d{1,2}-\\d{1,2})/(\\d{4}-\\d{1,2}-\\d{1,2})"),
44              Pattern.compile("^(\\d{4})/(\\d{4}-\\d{1,2}-\\d{1,2})")
45      };
46  
47      // NB: Using English locale here to avoid ambiguities caused by system dependent
48      // time zones such as: Cannot parse "1940-05-16": Illegal instant due to time zone
49      // offset transition (Europe/Amsterdam)
50      // https://en.wikipedia.org/wiki/UTC%2B00:20
51      private static final DateTimeFormatter isoDateTimeFormat = ISODateTimeFormat.date()
52              .withLocale(Locale.ENGLISH);
53  
54      // NB: Not static yet since these objects aren't thread safe :(
55      private static final SimpleDateFormat yearMonthDateFormat = new SimpleDateFormat("yyyy-MM");
56      private static final SimpleDateFormat yearDateFormat = new SimpleDateFormat("yyyy");
57      private static final XmlImportProperties dates = new XmlImportProperties("dates.properties");
58  
59  
60      static List<Map<String, Object>> extractDates(Map<String, Object> data) {
61          List<Map<String, Object>> extractedDates = Lists.newArrayList();
62  
63          for (String key : data.keySet()) {
64              if (key.equals(Entities.DATE_PERIOD) && data.get(key) instanceof List) {
65                  for (Map<String, Object> event : (List<Map<String, Object>>) data.get(key)) {
66                      extractedDates.add(getSubNode(event));
67                  }
68              }
69          }
70  
71          Map<String, String> dateValues = returnDatesAsString(data);
72          for (String s : dateValues.keySet()) {
73              extractDate(s).ifPresent(extractedDates::add);
74          }
75          replaceDates(data, extractedDates);
76          return extractedDates;
77      }
78  
79      private static void replaceDates(Map<String, Object> data, List<Map<String, Object>> extractedDates) {
80          Map<String, String> dateValues = returnDatesAsString(data);
81          Map<String, String> dateTypes = Maps.newHashMap();
82          for (String dateValue : dateValues.keySet()) {
83              dateTypes.put(dateValues.get(dateValue), null);
84          }
85          for (Map<String, Object> dateMap : extractedDates) {
86              dateValues.remove(dateMap.get(Ontology.DATE_HAS_DESCRIPTION));
87          }
88          //replace dates in data map
89          for (String datevalue : dateValues.keySet()) {
90              String dateType = dateValues.get(datevalue);
91              if (dateTypes.containsKey(dateType) && dateTypes.get(dateType) != null) {
92                  dateTypes.put(dateType, dateTypes.get(dateType) + ", " + datevalue.trim());
93              } else {
94                  dateTypes.put(dateType, datevalue.trim());
95              }
96          }
97          for (String dateType : dateTypes.keySet()) {
98              if (dateTypes.get(dateType) == null) {
99                  data.remove(dateType);
100             } else {
101                 data.put(dateType, dateTypes.get(dateType));
102             }
103         }
104     }
105 
106     private static Optional<Map<String, Object>> extractDate(String date) {
107         Map<String, Object> data = matchDate(date);
108         return data.isEmpty() ? Optional.empty() : Optional.of(data);
109     }
110 
111     private static Map<String, Object> matchDate(String date) {
112         Map<String, Object> data = Maps.newHashMap();
113         for (Pattern re : datePatterns) {
114             Matcher matcher = re.matcher(date);
115             if (matcher.matches()) {
116                 data.put(Ontology.DATE_PERIOD_START_DATE, normaliseDate(matcher.group(1)));
117                 data.put(Ontology.DATE_PERIOD_END_DATE, normaliseDate(matcher.group(matcher
118                         .groupCount() > 1 ? 2 : 1), true));
119                 data.put(Ontology.DATE_HAS_DESCRIPTION, date);
120                 break;
121             }
122         }
123         return data;
124     }
125 
126     private static Map<String, String> returnDatesAsString(Map<String, Object> data) {
127         Map<String, String> datesAsString = Maps.newHashMap();
128         Object value;
129         for (Map.Entry<String, Object> property : data.entrySet()) {
130             if (dates.containsProperty(property.getKey()) && (value = property.getValue()) != null) {
131                 if (property.getValue() instanceof String) {
132                     String dateValue = (String) value;
133                     for (String d : dateValue.split(",")) {
134                         datesAsString.put(d, property.getKey());
135                     }
136                 } else if (property.getValue() instanceof List) {
137                     for (String s : (List<String>) value) {
138                         datesAsString.put(s, property.getKey());
139                     }
140                 }
141             }
142         }
143         return datesAsString;
144     }
145 
146     static String normaliseDate(String date) {
147         return normaliseDate(date, false);
148     }
149 
150     /**
151      * Normalise a date in a string.
152      *
153      * @param date        a String date that needs formatting
154      * @param endOfPeriod a string signifying whether this date is the begin of
155      *                    a period or the end of a period
156      * @return a String containing the formatted date.
157      */
158     static String normaliseDate(String date, boolean endOfPeriod) {
159         String returnDate = isoDateTimeFormat.print(DateTime.parse(date));
160         if (returnDate.startsWith("00")) {
161             returnDate = "19" + returnDate.substring(2);
162             date = "19" + date;
163         }
164         if (endOfPeriod) {
165             if (!date.equals(returnDate)) {
166                 ParsePosition p = new ParsePosition(0);
167                 yearMonthDateFormat.parse(date, p);
168                 if (p.getIndex() > 0) {
169                     returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusMonths(1).minusDays(1));
170                 } else {
171                     p = new ParsePosition(0);
172                     yearDateFormat.parse(date, p);
173                     if (p.getIndex() > 0) {
174                         returnDate = isoDateTimeFormat.print(DateTime.parse(date).plusYears(1).minusDays(1));
175                     }
176                 }
177             }
178         }
179         return returnDate;
180     }
181 }