1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 package eu.ehri.project.importers.csv;
21
22 import com.google.common.collect.Lists;
23 import org.slf4j.Logger;
24 import org.slf4j.LoggerFactory;
25
26 import java.io.InputStream;
27 import java.text.ParsePosition;
28 import java.text.SimpleDateFormat;
29 import java.util.Calendar;
30 import java.util.Date;
31 import java.util.List;
32 import java.util.Locale;
33 import java.util.Scanner;
34 import java.util.regex.Matcher;
35 import java.util.regex.Pattern;
36
37
38
39
40 public class TerezinDataConverter {
41
42 private static final Logger logger = LoggerFactory.getLogger(TerezinDataConverter.class);
43
44 public static void readFile(InputStream stream) {
45 Scanner scanner = new Scanner(stream);
46 while (scanner.hasNextLine()) {
47 String line = scanner.nextLine();
48 String[] quotes = line.split("\"");
49 String[] values;
50 if (quotes.length == 1) {
51 values = line.split(";");
52 } else {
53 if (quotes[0].length() <= 1) {
54 logger.error("Problem found at line: {}: {}", line, quotes[0]);
55 break;
56 }
57 values = new String[2];
58 values[0] = quotes[0].substring(0, quotes[0].length() - 2);
59 values[1] = quotes[1];
60 }
61 if (values.length == 2) {
62 String[] datevalues = values[1].split(";");
63 for (String datevalue : datevalues) {
64 List<Calendar> list = parseDate(datevalue);
65 if (list == null) {
66
67 logger.error(values[0] + " := " + datevalue);
68 }
69 }
70 } else {
71 logger.error(values[0] + " has a problem: " + line);
72 }
73 }
74 scanner.close();
75 }
76
77 public static List<Calendar> parseDate(String datevalue) {
78 String trimmedDate = datevalue.trim();
79 List<Calendar> dates = Lists.newArrayList();
80 dates.add(Calendar.getInstance());
81
82 if (trimmedDate.startsWith("before ")) {
83 dates = parseDate(trimmedDate.substring(6));
84 if (dates == null)
85 return null;
86
87 dates.add(0, Calendar.getInstance());
88 dates.get(0).set(Calendar.YEAR, 1900);
89 dates.get(0).set(Calendar.DATE, 1);
90 dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
91 return dates;
92 }
93 if (trimmedDate.startsWith("after ")) {
94 dates = parseDate(trimmedDate.substring(5));
95 if (dates == null)
96 return null;
97 dates.get(0).set(Calendar.YEAR, dates.get(0).get(Calendar.YEAR) + 1);
98 dates.add(1, Calendar.getInstance());
99 return dates;
100 }
101
102
103
104 ParsePosition p = new ParsePosition(0);
105 SimpleDateFormat monthDateFormat = new SimpleDateFormat("MMM yyyy", Locale.US);
106 Date d = monthDateFormat.parse(trimmedDate, p);
107 if (p.getIndex() > 0) {
108 dates.get(0).setTime(d);
109 dates.add(Calendar.getInstance());
110 dates.get(1).setTime(d);
111
112 dates.get(0).set(Calendar.DATE, 1);
113
114 dates.get(1).set(Calendar.MONTH, dates.get(1).get(Calendar.MONTH) + 1);
115 dates.get(1).set(Calendar.DATE, 0);
116 return dates;
117 }
118
119
120 Matcher m;
121
122 if ((m = parseDate(trimmedDate, "(\\d{4})\\s*-\\s*(\\D+\\s+\\d{4})")) != null) {
123 dates.get(0).set(Calendar.YEAR, new Integer(m.group(1)));
124 dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
125 dates.get(0).set(Calendar.DATE, 1);
126
127 dates.add(Calendar.getInstance());
128 d = monthDateFormat.parse(m.group(2), p);
129 if (p.getIndex() > 0) {
130 dates.get(1).setTime(d);
131 dates.get(1).set(Calendar.MONTH, dates.get(1).get(Calendar.MONTH) + 1);
132 dates.get(1).set(Calendar.DATE, 0);
133 }
134
135 return dates;
136 }
137
138 if ((m = parseDate(trimmedDate, "(\\D*)\\s*-\\s*(\\D+\\s+\\d{4})")) != null) {
139
140 SimpleDateFormat month = new SimpleDateFormat("MMM", Locale.US);
141 d = month.parse(m.group(1), p);
142 if (p.getIndex() > 0) {
143 dates.get(0).setTime(d);
144 dates.get(0).set(Calendar.DATE, 1);
145 }
146
147 dates.add(Calendar.getInstance());
148 p.setIndex(0);
149 d = monthDateFormat.parse(m.group(2), p);
150 if (p.getIndex() > 0) {
151 dates.get(1).setTime(d);
152 dates.get(1).set(Calendar.MONTH, dates.get(1).get(Calendar.MONTH) + 1);
153 dates.get(1).set(Calendar.DATE, 0);
154 dates.get(0).set(Calendar.YEAR, dates.get(1).get(Calendar.YEAR));
155 }
156
157 return dates;
158 }
159
160
161 if ((m = parseDate(trimmedDate, "(\\d+)\\.(\\d+)\\.(\\d{4}).*?-.*?(\\d+)\\.(\\d+)\\.(\\d{4})")) != null) {
162 dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
163 dates.get(0).set(Calendar.MONTH, new Integer(m.group(2)) - 1);
164 dates.get(0).set(Calendar.YEAR, new Integer(m.group(3)));
165
166 dates.add(Calendar.getInstance());
167 dates.get(1).set(Calendar.DATE, new Integer(m.group(4)));
168 dates.get(1).set(Calendar.MONTH, new Integer(m.group(5)) - 1);
169 dates.get(1).set(Calendar.YEAR, new Integer(m.group(6)));
170
171 return dates;
172 }
173
174 if ((m = parseDate(trimmedDate, "(\\d+)\\.(\\d+)\\..*?-.*?(\\d*)\\.(\\d*)\\.(\\d{4})")) != null) {
175 dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
176 dates.get(0).set(Calendar.MONTH, new Integer(m.group(2)) - 1);
177 dates.get(0).set(Calendar.YEAR, new Integer(m.group(5)));
178
179 dates.add(Calendar.getInstance());
180 dates.get(1).set(Calendar.DATE, new Integer(m.group(3)));
181 dates.get(1).set(Calendar.MONTH, new Integer(m.group(4)) - 1);
182 dates.get(1).set(Calendar.YEAR, new Integer(m.group(5)));
183
184 return dates;
185 }
186
187 if ((m = parseDate(trimmedDate, "(\\d+)\\.\\s*?-\\s*?(\\d*)\\.(\\d*)\\.(\\d{4})")) != null) {
188 dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
189 dates.get(0).set(Calendar.MONTH, new Integer(m.group(3)) - 1);
190 dates.get(0).set(Calendar.YEAR, new Integer(m.group(4)));
191
192 dates.add(Calendar.getInstance());
193 dates.get(1).set(Calendar.DATE, new Integer(m.group(2)));
194 dates.get(1).set(Calendar.MONTH, new Integer(m.group(3)) - 1);
195 dates.get(1).set(Calendar.YEAR, new Integer(m.group(4)));
196
197 return dates;
198 }
199
200 if ((m = parseDate(trimmedDate, "(\\d{4})\\s*-\\s*(\\d{4})")) != null) {
201 dates.get(0).set(Calendar.YEAR, new Integer(m.group(1)));
202 dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
203 dates.get(0).set(Calendar.DATE, 1);
204
205 dates.add(Calendar.getInstance());
206 dates.get(1).set(Calendar.YEAR, new Integer(m.group(2)));
207 dates.get(1).set(Calendar.MONTH, Calendar.DECEMBER);
208 dates.get(1).set(Calendar.DATE, 31);
209
210 return dates;
211 }
212
213 if ((m = parseDate(trimmedDate, "(\\d{4})\\s*-\\s*(\\d{2})")) != null) {
214 dates.get(0).set(Calendar.YEAR, new Integer(m.group(1)));
215 dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
216 dates.get(0).set(Calendar.DATE, 1);
217
218 dates.add(Calendar.getInstance());
219 dates.get(1).set(Calendar.YEAR, 1900 + new Integer(m.group(2)));
220 dates.get(1).set(Calendar.MONTH, Calendar.DECEMBER);
221 dates.get(1).set(Calendar.DATE, 31);
222
223 return dates;
224 }
225
226 if ((m = parseDate(trimmedDate, "(\\d*)\\.(\\d*)\\.(\\d{4})")) != null) {
227 dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
228 dates.get(0).set(Calendar.MONTH, new Integer(m.group(2)) - 1);
229 dates.get(0).set(Calendar.YEAR, new Integer(m.group(3)));
230 return dates;
231 }
232
233 if ((m = parseDate(trimmedDate, "(\\d*)\\.(\\d*)\\.(\\d{2})")) != null) {
234 dates.get(0).set(Calendar.DATE, new Integer(m.group(1)));
235 dates.get(0).set(Calendar.MONTH, new Integer(m.group(2)) - 1);
236 dates.get(0).set(Calendar.YEAR, 1900 + new Integer(m.group(3)));
237 return dates;
238 }
239
240 if ((m = parseDate(trimmedDate, "(\\d{4})")) != null) {
241 dates.get(0).set(Calendar.YEAR, new Integer(m.group(1)));
242 dates.get(0).set(Calendar.MONTH, Calendar.JANUARY);
243 dates.get(0).set(Calendar.DATE, 1);
244
245 dates.add(Calendar.getInstance());
246 dates.get(1).set(Calendar.YEAR, new Integer(m.group(1)));
247 dates.get(1).set(Calendar.MONTH, Calendar.DECEMBER);
248 dates.get(1).set(Calendar.DATE, 31);
249
250 return dates;
251 }
252 return null;
253 }
254
255 private static Matcher parseDate(String datevalue, String pattern) {
256 Pattern yearPattern = Pattern.compile(pattern);
257 Matcher matcher = yearPattern.matcher(datevalue);
258 if (matcher.matches()) {
259 return matcher;
260 }
261 return null;
262 }
263 }