View Javadoc

1   /*
2    * Copyright 2015 Data Archiving and Networked Services (an institute of
3    * Koninklijke Nederlandse Akademie van Wetenschappen), King's College London,
4    * Georg-August-Universitaet Goettingen Stiftung Oeffentlichen Rechts
5    *
6    * Licensed under the EUPL, Version 1.1 or – as soon they will be approved by
7    * the European Commission - subsequent versions of the EUPL (the "Licence");
8    * You may not use this work except in compliance with the Licence.
9    * You may obtain a copy of the Licence at:
10   *
11   * https://joinup.ec.europa.eu/software/page/eupl
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the Licence is distributed on an "AS IS" basis,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the Licence for the specific language governing
17   * permissions and limitations under the Licence.
18   */
19  
20  package eu.ehri.project.importers.csv;
21  
22  import com.google.common.collect.Lists;
23  import org.slf4j.Logger;
24  import org.slf4j.LoggerFactory;
25  
26  import java.io.InputStream;
27  import java.text.ParsePosition;
28  import java.text.SimpleDateFormat;
29  import java.util.Calendar;
30  import java.util.Date;
31  import java.util.List;
32  import java.util.Locale;
33  import java.util.Scanner;
34  import java.util.regex.Matcher;
35  import java.util.regex.Pattern;
36  
37  /**
38   * Utility to convert Terezin data.
39   */
40  public class TerezinDataConverter {
41  
42      private static final Logger logger = LoggerFactory.getLogger(TerezinDataConverter.class);
43  
44      public static void readFile(InputStream stream) {
45          Scanner scanner = new Scanner(stream);
46          while (scanner.hasNextLine()) {
47              String line = scanner.nextLine();
48              String[] quotes = line.split("\"");
49              String[] values;
50              if (quotes.length == 1) {
51                  values = line.split(";");
52              } else {
53                  if (quotes[0].length() <= 1) {
54                      logger.error("Problem found at line: {}: {}", line, quotes[0]);
55                      break;
56                  }
57                  values = new String[2];
58                  values[0] = quotes[0].substring(0, quotes[0].length() - 2);
59                  values[1] = quotes[1];
60              }
61              if (values.length == 2) {
62                  String[] datevalues = values[1].split(";");
63                  for (String datevalue : datevalues) {
64                      List<Calendar> list = parseDate(datevalue);
65                      if (list == null) {
66                          //parse failed, print the date and the identifier
67                          logger.error(values[0] + " := " + datevalue);
68                      }
69                  }
70              } else {
71                  logger.error(values[0] + " has a problem: " + line);
72              }
73          }
74          scanner.close();
75      }
76  
77      public static List<Calendar> parseDate(String datevalue) {
78          String trimmedDate = datevalue.trim();
79          List<Calendar> dates = Lists.newArrayList();
80          dates.add(Calendar.getInstance());
81  
82          if (trimmedDate.startsWith("before ")) {
83              dates = parseDate(trimmedDate.substring(6));
84              if (dates == null)
85                  return null;
86  
87              dates.add(0, Calendar.getInstance());
88              dates.get(0).set(Calendar.YEAR, 1900);
89              dates.get(0).set(Calendar.DATE, 1);
90              dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
91              return dates;
92          }
93          if (trimmedDate.startsWith("after ")) {
94              dates = parseDate(trimmedDate.substring(5));
95              if (dates == null)
96                  return null;
97              dates.get(0).set(Calendar.YEAR, dates.get(0).get(Calendar.YEAR) + 1);
98              dates.add(1, Calendar.getInstance());
99              return dates;
100         }
101 
102         // April 1942
103         //July 1946
104         ParsePosition p = new ParsePosition(0);
105         SimpleDateFormat monthDateFormat = new SimpleDateFormat("MMM yyyy", Locale.US);
106         Date d = monthDateFormat.parse(trimmedDate, p);
107         if (p.getIndex() > 0) {
108             dates.get(0).setTime(d);
109             dates.add(Calendar.getInstance());
110             dates.get(1).setTime(d);
111 
112             dates.get(0).set(Calendar.DATE, 1);
113 
114             dates.get(1).set(Calendar.MONTH, dates.get(1).get(Calendar.MONTH) + 1);
115             dates.get(1).set(Calendar.DATE, 0);
116             return dates;
117         }
118 
119 
120         Matcher m;
121         //1941 - June 1943
122         if ((m = parseDate(trimmedDate, "(\\d{4})\\s*-\\s*(\\D+\\s+\\d{4})")) != null) {
123             dates.get(0).set(Calendar.YEAR, new Integer(m.group(1)));
124             dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
125             dates.get(0).set(Calendar.DATE, 1);
126 
127             dates.add(Calendar.getInstance());
128             d = monthDateFormat.parse(m.group(2), p);
129             if (p.getIndex() > 0) {
130                 dates.get(1).setTime(d);
131                 dates.get(1).set(Calendar.MONTH, dates.get(1).get(Calendar.MONTH) + 1);
132                 dates.get(1).set(Calendar.DATE, 0);
133             }
134 
135             return dates;
136         }
137         //May - June 1945
138         if ((m = parseDate(trimmedDate, "(\\D*)\\s*-\\s*(\\D+\\s+\\d{4})")) != null) {
139 
140             SimpleDateFormat month = new SimpleDateFormat("MMM", Locale.US);
141             d = month.parse(m.group(1), p);
142             if (p.getIndex() > 0) {
143                 dates.get(0).setTime(d);
144                 dates.get(0).set(Calendar.DATE, 1);
145             }
146 
147             dates.add(Calendar.getInstance());
148             p.setIndex(0);
149             d = monthDateFormat.parse(m.group(2), p);
150             if (p.getIndex() > 0) {
151                 dates.get(1).setTime(d);
152                 dates.get(1).set(Calendar.MONTH, dates.get(1).get(Calendar.MONTH) + 1);
153                 dates.get(1).set(Calendar.DATE, 0);
154                 dates.get(0).set(Calendar.YEAR, dates.get(1).get(Calendar.YEAR));
155             }
156 
157             return dates;
158         }
159 
160         //1.11.1943-10.11.1943
161         if ((m = parseDate(trimmedDate, "(\\d+)\\.(\\d+)\\.(\\d{4}).*?-.*?(\\d+)\\.(\\d+)\\.(\\d{4})")) != null) {
162             dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
163             dates.get(0).set(Calendar.MONTH, new Integer(m.group(2)) - 1); //Calendar.month counts from 0 ... 11
164             dates.get(0).set(Calendar.YEAR, new Integer(m.group(3)));
165 
166             dates.add(Calendar.getInstance());
167             dates.get(1).set(Calendar.DATE, new Integer(m.group(4)));
168             dates.get(1).set(Calendar.MONTH, new Integer(m.group(5)) - 1);
169             dates.get(1).set(Calendar.YEAR, new Integer(m.group(6)));
170 
171             return dates;
172         }
173         //1.12. - 24.12.1942
174         if ((m = parseDate(trimmedDate, "(\\d+)\\.(\\d+)\\..*?-.*?(\\d*)\\.(\\d*)\\.(\\d{4})")) != null) {
175             dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
176             dates.get(0).set(Calendar.MONTH, new Integer(m.group(2)) - 1); //Calendar.month counts from 0 ... 11
177             dates.get(0).set(Calendar.YEAR, new Integer(m.group(5)));
178 
179             dates.add(Calendar.getInstance());
180             dates.get(1).set(Calendar.DATE, new Integer(m.group(3)));
181             dates.get(1).set(Calendar.MONTH, new Integer(m.group(4)) - 1);
182             dates.get(1).set(Calendar.YEAR, new Integer(m.group(5)));
183 
184             return dates;
185         }
186         // 1. - 30.11.1942
187         if ((m = parseDate(trimmedDate, "(\\d+)\\.\\s*?-\\s*?(\\d*)\\.(\\d*)\\.(\\d{4})")) != null) {
188             dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
189             dates.get(0).set(Calendar.MONTH, new Integer(m.group(3)) - 1); //Calendar.month counts from 0 ... 11
190             dates.get(0).set(Calendar.YEAR, new Integer(m.group(4)));
191 
192             dates.add(Calendar.getInstance());
193             dates.get(1).set(Calendar.DATE, new Integer(m.group(2)));
194             dates.get(1).set(Calendar.MONTH, new Integer(m.group(3)) - 1);
195             dates.get(1).set(Calendar.YEAR, new Integer(m.group(4)));
196 
197             return dates;
198         }
199         // 1940 - 1942
200         if ((m = parseDate(trimmedDate, "(\\d{4})\\s*-\\s*(\\d{4})")) != null) {
201             dates.get(0).set(Calendar.YEAR, new Integer(m.group(1)));
202             dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
203             dates.get(0).set(Calendar.DATE, 1);
204 
205             dates.add(Calendar.getInstance());
206             dates.get(1).set(Calendar.YEAR, new Integer(m.group(2)));
207             dates.get(1).set(Calendar.MONTH, Calendar.DECEMBER);
208             dates.get(1).set(Calendar.DATE, 31);
209 
210             return dates;
211         }
212         // 1940 - 42
213         if ((m = parseDate(trimmedDate, "(\\d{4})\\s*-\\s*(\\d{2})")) != null) {
214             dates.get(0).set(Calendar.YEAR, new Integer(m.group(1)));
215             dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
216             dates.get(0).set(Calendar.DATE, 1);
217 
218             dates.add(Calendar.getInstance());
219             dates.get(1).set(Calendar.YEAR, 1900 + new Integer(m.group(2)));
220             dates.get(1).set(Calendar.MONTH, Calendar.DECEMBER);
221             dates.get(1).set(Calendar.DATE, 31);
222 
223             return dates;
224         }
225         //20.3.1942
226         if ((m = parseDate(trimmedDate, "(\\d*)\\.(\\d*)\\.(\\d{4})")) != null) {
227             dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
228             dates.get(0).set(Calendar.MONTH, new Integer(m.group(2)) - 1);
229             dates.get(0).set(Calendar.YEAR, new Integer(m.group(3)));
230             return dates;
231         }
232         //05.03.43
233         if ((m = parseDate(trimmedDate, "(\\d*)\\.(\\d*)\\.(\\d{2})")) != null) {
234             dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
235             dates.get(0).set(Calendar.MONTH, new Integer(m.group(2)) - 1);
236             dates.get(0).set(Calendar.YEAR, 1900 + new Integer(m.group(3)));
237             return dates;
238         }
239         //1942
240         if ((m = parseDate(trimmedDate, "(\\d{4})")) != null) {
241             dates.get(0).set(Calendar.YEAR, new Integer(m.group(1)));
242             dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
243             dates.get(0).set(Calendar.DATE, 1);
244 
245             dates.add(Calendar.getInstance());
246             dates.get(1).set(Calendar.YEAR, new Integer(m.group(1)));
247             dates.get(1).set(Calendar.MONTH, Calendar.DECEMBER);
248             dates.get(1).set(Calendar.DATE, 31);
249 
250             return dates;
251         }
252         return null;
253     }
254 
255     private static Matcher parseDate(String datevalue, String pattern) {
256         Pattern yearPattern = Pattern.compile(pattern);
257         Matcher matcher = yearPattern.matcher(datevalue);
258         if (matcher.matches()) {
259             return matcher;
260         }
261         return null;
262     }
263 }