View Javadoc

1   package eu.ehri.project.utils;
2   
3   import com.google.common.base.Splitter;
4   import com.google.common.collect.ImmutableBiMap;
5   import com.google.common.collect.ImmutableMap;
6   import com.google.common.collect.Lists;
7   import com.google.common.collect.Maps;
8   import eu.ehri.project.models.base.Described;
9   import eu.ehri.project.models.base.Description;
10  import eu.ehri.project.models.base.Entity;
11  
12  import java.util.Collections;
13  import java.util.Comparator;
14  import java.util.List;
15  import java.util.Locale;
16  import java.util.Map;
17  import java.util.Optional;
18  
19  
20  /**
21   * Utilities for language code conversions.
22   */
23  public class LanguageHelpers {
24  
25      private static final Map<String, Locale> locale2Map;
26      private static final Map<String, String> locale3Map;
27      private static final Map<String, Locale> localeNameMap;
28  
29      static {
30          String[] languages = Locale.getISOLanguages();
31          locale2Map = Maps.newHashMapWithExpectedSize(languages.length);
32          locale3Map = Maps.newHashMapWithExpectedSize(languages.length);
33          localeNameMap = Maps.newHashMapWithExpectedSize(languages.length);
34          for (String language : languages) {
35              Locale locale = new Locale(language);
36              locale2Map.put(language, locale);
37              locale3Map.put(locale.getISO3Language(), language);
38              localeNameMap.put(locale.getDisplayLanguage(Locale.ENGLISH).toLowerCase(), locale);
39          }
40      }
41  
42      // Splitter for breaking up codes
43      private static final Splitter codeSplitter = Splitter.on("-").omitEmptyStrings().limit(2);
44  
45      /**
46       * Limited selection of bibliographical 3-letter codes for the languages
47       * we're most likely to run into, and their mappings to ISO639-2 Term codes.
48       */
49      private static final ImmutableBiMap<String, String> iso639BibTermLookup = ImmutableBiMap.<String, String>builder()
50              .put("alb", "sqi") // albanian
51              .put("arm", "hye") // armenian
52              .put("baq", "eus") // basque
53              .put("ger", "deu") // german
54              .put("dut", "nld") // dutch
55              .put("rum", "ron") // romanian
56              .put("mac", "mkd") // macedonian
57              .put("slo", "slk") // slovak
58              .put("fre", "fra") // french
59              .put("cze", "ces") // czech
60              .build();
61  
62      /**
63       * Continent names as defined by the EAG schema
64       */
65      private static final ImmutableBiMap<String, String> continentCodes = ImmutableBiMap.<String, String>builder()
66              .put("AF", "Africa")
67              .put("AN", "Antarctica")
68              .put("AS", "Asia")
69              .put("EU", "Europe")
70              .put("NA", "North America")
71              .put("OC", "Australia")
72              .put("SA", "South America")
73              .build();
74  
75      private static final ImmutableMap<String, String> countryCodesToContinents = ImmutableMap.<String, String>builder()
76              .put("AD", "EU")
77              .put("AE", "AS")
78              .put("AF", "AS")
79              .put("AG", "NA")
80              .put("AI", "NA")
81              .put("AL", "EU")
82              .put("AM", "AS")
83              .put("AN", "NA")
84              .put("AO", "AF")
85              .put("AP", "AS")
86              .put("AQ", "AN")
87              .put("AR", "SA")
88              .put("AS", "OC")
89              .put("AT", "EU")
90              .put("AU", "OC")
91              .put("AW", "NA")
92              .put("AX", "EU")
93              .put("AZ", "AS")
94              .put("BA", "EU")
95              .put("BB", "NA")
96              .put("BD", "AS")
97              .put("BE", "EU")
98              .put("BF", "AF")
99              .put("BG", "EU")
100             .put("BH", "AS")
101             .put("BI", "AF")
102             .put("BJ", "AF")
103             .put("BL", "NA")
104             .put("BM", "NA")
105             .put("BN", "AS")
106             .put("BO", "SA")
107             .put("BR", "SA")
108             .put("BS", "NA")
109             .put("BT", "AS")
110             .put("BV", "AN")
111             .put("BW", "AF")
112             .put("BY", "EU")
113             .put("BZ", "NA")
114             .put("CA", "NA")
115             .put("CC", "AS")
116             .put("CD", "AF")
117             .put("CF", "AF")
118             .put("CG", "AF")
119             .put("CH", "EU")
120             .put("CI", "AF")
121             .put("CK", "OC")
122             .put("CL", "SA")
123             .put("CM", "AF")
124             .put("CN", "AS")
125             .put("CO", "SA")
126             .put("CR", "NA")
127             .put("CU", "NA")
128             .put("CV", "AF")
129             .put("CX", "AS")
130             .put("CY", "AS")
131             .put("CZ", "EU")
132             .put("DE", "EU")
133             .put("DJ", "AF")
134             .put("DK", "EU")
135             .put("DM", "NA")
136             .put("DO", "NA")
137             .put("DZ", "AF")
138             .put("EC", "SA")
139             .put("EE", "EU")
140             .put("EG", "AF")
141             .put("EH", "AF")
142             .put("ER", "AF")
143             .put("ES", "EU")
144             .put("ET", "AF")
145             .put("EU", "EU")
146             .put("FI", "EU")
147             .put("FJ", "OC")
148             .put("FK", "SA")
149             .put("FM", "OC")
150             .put("FO", "EU")
151             .put("FR", "EU")
152             .put("FX", "EU")
153             .put("GA", "AF")
154             .put("GB", "EU")
155             .put("GD", "NA")
156             .put("GE", "AS")
157             .put("GF", "SA")
158             .put("GG", "EU")
159             .put("GH", "AF")
160             .put("GI", "EU")
161             .put("GL", "NA")
162             .put("GM", "AF")
163             .put("GN", "AF")
164             .put("GP", "NA")
165             .put("GQ", "AF")
166             .put("GR", "EU")
167             .put("GS", "AN")
168             .put("GT", "NA")
169             .put("GU", "OC")
170             .put("GW", "AF")
171             .put("GY", "SA")
172             .put("HK", "AS")
173             .put("HM", "AN")
174             .put("HN", "NA")
175             .put("HR", "EU")
176             .put("HT", "NA")
177             .put("HU", "EU")
178             .put("ID", "AS")
179             .put("IE", "EU")
180             .put("IL", "AS")
181             .put("IM", "EU")
182             .put("IN", "AS")
183             .put("IO", "AS")
184             .put("IQ", "AS")
185             .put("IR", "AS")
186             .put("IS", "EU")
187             .put("IT", "EU")
188             .put("JE", "EU")
189             .put("JM", "NA")
190             .put("JO", "AS")
191             .put("JP", "AS")
192             .put("KE", "AF")
193             .put("KG", "AS")
194             .put("KH", "AS")
195             .put("KI", "OC")
196             .put("KM", "AF")
197             .put("KN", "NA")
198             .put("KP", "AS")
199             .put("KR", "AS")
200             .put("KW", "AS")
201             .put("KY", "NA")
202             .put("KZ", "AS")
203             .put("LA", "AS")
204             .put("LB", "AS")
205             .put("LC", "NA")
206             .put("LI", "EU")
207             .put("LK", "AS")
208             .put("LR", "AF")
209             .put("LS", "AF")
210             .put("LT", "EU")
211             .put("LU", "EU")
212             .put("LV", "EU")
213             .put("LY", "AF")
214             .put("MA", "AF")
215             .put("MC", "EU")
216             .put("MD", "EU")
217             .put("ME", "EU")
218             .put("MF", "NA")
219             .put("MG", "AF")
220             .put("MH", "OC")
221             .put("MK", "EU")
222             .put("ML", "AF")
223             .put("MM", "AS")
224             .put("MN", "AS")
225             .put("MO", "AS")
226             .put("MP", "OC")
227             .put("MQ", "NA")
228             .put("MR", "AF")
229             .put("MS", "NA")
230             .put("MT", "EU")
231             .put("MU", "AF")
232             .put("MV", "AS")
233             .put("MW", "AF")
234             .put("MX", "NA")
235             .put("MY", "AS")
236             .put("MZ", "AF")
237             .put("NA", "AF")
238             .put("NC", "OC")
239             .put("NE", "AF")
240             .put("NF", "OC")
241             .put("NG", "AF")
242             .put("NI", "NA")
243             .put("NL", "EU")
244             .put("NO", "EU")
245             .put("NP", "AS")
246             .put("NR", "OC")
247             .put("NU", "OC")
248             .put("NZ", "OC")
249             .put("O1", "--")
250             .put("OM", "AS")
251             .put("PA", "NA")
252             .put("PE", "SA")
253             .put("PF", "OC")
254             .put("PG", "OC")
255             .put("PH", "AS")
256             .put("PK", "AS")
257             .put("PL", "EU")
258             .put("PM", "NA")
259             .put("PN", "OC")
260             .put("PR", "NA")
261             .put("PS", "AS")
262             .put("PT", "EU")
263             .put("PW", "OC")
264             .put("PY", "SA")
265             .put("QA", "AS")
266             .put("RE", "AF")
267             .put("RO", "EU")
268             .put("RS", "EU")
269             .put("RU", "EU")
270             .put("RW", "AF")
271             .put("SA", "AS")
272             .put("SB", "OC")
273             .put("SC", "AF")
274             .put("SD", "AF")
275             .put("SE", "EU")
276             .put("SG", "AS")
277             .put("SH", "AF")
278             .put("SI", "EU")
279             .put("SJ", "EU")
280             .put("SK", "EU")
281             .put("SL", "AF")
282             .put("SM", "EU")
283             .put("SN", "AF")
284             .put("SO", "AF")
285             .put("SR", "SA")
286             .put("ST", "AF")
287             .put("SV", "NA")
288             .put("SY", "AS")
289             .put("SZ", "AF")
290             .put("TC", "NA")
291             .put("TD", "AF")
292             .put("TF", "AN")
293             .put("TG", "AF")
294             .put("TH", "AS")
295             .put("TJ", "AS")
296             .put("TK", "OC")
297             .put("TL", "AS")
298             .put("TM", "AS")
299             .put("TN", "AF")
300             .put("TO", "OC")
301             .put("TR", "EU")
302             .put("TT", "NA")
303             .put("TV", "OC")
304             .put("TW", "AS")
305             .put("TZ", "AF")
306             .put("UA", "EU")
307             .put("UG", "AF")
308             .put("UM", "OC")
309             .put("US", "NA")
310             .put("UY", "SA")
311             .put("UZ", "AS")
312             .put("VA", "EU")
313             .put("VC", "NA")
314             .put("VE", "SA")
315             .put("VG", "NA")
316             .put("VI", "NA")
317             .put("VN", "AS")
318             .put("VU", "OC")
319             .put("WF", "OC")
320             .put("WS", "OC")
321             .put("YE", "AS")
322             .put("YT", "AF")
323             .put("ZA", "AF")
324             .put("ZM", "AF")
325             .put("ZW", "AF")
326             .build();
327 
328     public static Optional<String> countryCodeToContinent(String countryCode) {
329         String continentCode = countryCodesToContinents.get(countryCode.toUpperCase());
330         if (continentCode != null) {
331             return Optional.ofNullable(continentCodes.get(continentCode));
332         }
333         return Optional.empty();
334     }
335 
336     /**
337      * Get the best description for a given language code.
338      *
339      * @param item         a described item
340      * @param priorDescOpt if the object is hierarchical, the parent-level
341      *                     description
342      * @param langCode     a 3-letter language code.
343      * @return the best matching description found
344      */
345     public static Optional<Description> getBestDescription(Described item, Optional<Description> priorDescOpt, String langCode) {
346         List<Description> descriptions = Lists.newArrayList(item.getDescriptions());
347         descriptions.sort(Comparator.comparing(Entity::getId));
348         Description fallBack = null;
349         for (Description description : descriptions) {
350             if (fallBack == null) {
351                 fallBack = description;
352             }
353             // First of all, check the description code (usually set to the
354             // EAD file ID.) If this is the same as the parent, return the
355             // current description.
356             for (Description parent : priorDescOpt.map(Collections::singleton).orElse(Collections.emptySet())) {
357                 for (String code : Optional.ofNullable(parent.getDescriptionCode())
358                         .map(Collections::singleton).orElse(Collections.emptySet())) {
359                     if (code.equals(description.getDescriptionCode())) {
360                         return Optional.of(description);
361                     }
362                 }
363             }
364 
365             // Otherwise, fall back to the first one with the same language
366             if (description.getLanguageOfDescription().equalsIgnoreCase(langCode)) {
367                 return Optional.of(description);
368             }
369         }
370         return Optional.ofNullable(fallBack);
371     }
372 
373     public static Optional<Description> getBestDescription(Described item, String langCode) {
374         return getBestDescription(item, Optional.empty(), langCode);
375     }
376 
377     /**
378      * Take an ISO-639-1 code or a language name and try and map to a valid ISO639-2 code.
379      *
380      * @param nameOrCode a language code or name to convert
381      * @return the ISO 639-2 language code for that code or name, or the input string if
382      * no conversion was found
383      */
384     public static String iso639DashTwoCode(String nameOrCode) {
385         if (nameOrCode.length() == 2 && locale2Map.containsKey(nameOrCode)) {
386             return locale2Map.get(nameOrCode).getISO3Language();
387         } else if (nameOrCode.length() == 3 && iso639BibTermLookup.containsKey(nameOrCode)) {
388             return iso639BibTermLookup.get(nameOrCode);
389         } else if (nameOrCode.length() > 3 && localeNameMap.containsKey(nameOrCode.toLowerCase())) {
390             return localeNameMap.get(nameOrCode.toLowerCase()).getISO3Language();
391             /* FIXME the localeNameMap depends on locale and translating an
392              * English name to a code fails when executed on
393              * e.g. a server with non-English locale
394              */
395         }
396         return nameOrCode;
397     }
398 
399     /**
400      * Take an ISO-639-2 code or a language name and try and map to a valid ISO639-1 code.
401      *
402      * @param nameOrCode a language code or name to convert
403      * @return the ISO 639-1 language code for that code or name, or the input string if
404      * no conversion was found
405      */
406     public static String iso639DashOneCode(String nameOrCode) {
407         if (nameOrCode.length() == 3 && locale3Map.containsKey(nameOrCode)) {
408             return locale3Map.get(nameOrCode);
409         } else if (nameOrCode.length() == 3 && iso639BibTermLookup.containsKey(nameOrCode)) {
410             return locale3Map.get(iso639BibTermLookup.get(nameOrCode));
411         } else if (nameOrCode.length() > 3 && localeNameMap.containsKey(nameOrCode.toLowerCase())) {
412             return localeNameMap.get(nameOrCode.toLowerCase()).getLanguage();
413         } else if (nameOrCode.length() > 2 && nameOrCode.contains("-")) {
414             // Attempt to handle codes like 'heb-Hebr' and 'eng-Latn'
415             List<String> parts = Lists.newArrayList(codeSplitter.split(nameOrCode));
416             if (parts.size() == 1) {
417                 return iso639DashOneCode(parts.get(0));
418             } else if (parts.size() == 2) {
419                 return iso639DashOneCode(parts.get(0)) + "-" + parts.get(1);
420             }
421         }
422         return nameOrCode;
423     }
424 
425     /**
426      * Convert a 2- or 3-letter language code to its (English) name.
427      * <p>
428      * If no name can be found the input code will be returned.
429      *
430      * @param code a 2- or 3-letter ISO639 code
431      * @return a language name, with the original code as a fall back
432      */
433     public static String codeToName(String code) {
434         if (code.length() == 2 && locale2Map.containsKey(code)) {
435             return locale2Map.get(code).getDisplayLanguage(Locale.ENGLISH);
436         } else if (code.length() == 3) {
437             String termCode = iso639BibTermLookup.containsKey(code)
438                     ? iso639BibTermLookup.get(code)
439                     : code;
440             String twoCode = locale3Map.get(termCode);
441             if (locale2Map.containsKey(twoCode)) {
442                 return locale2Map.get(twoCode).getDisplayLanguage(Locale.ENGLISH);
443             }
444         }
445         return code;
446     }
447 
448     /**
449      * Convert an ISO 3166-1 country code to its (English) name.
450      *
451      * @param code the 2-letter country code
452      * @return the country name, or the code as a fallback
453      */
454     public static String iso3166dashOneCodeToName(String code) {
455         return new Locale(Locale.ENGLISH.getLanguage(), code)
456                 .getDisplayCountry(Locale.ENGLISH);
457     }
458 
459     /**
460      * Convert and ISO639-2 code to its (English) name.
461      *
462      * @param code the 2-letter country code
463      * @return the country name
464      */
465     public static String countryCodeToName(String code) {
466         return new java.util.Locale(Locale.ENGLISH.getLanguage(), code)
467                 .getDisplayCountry();
468     }
469 }