View Javadoc

1   /*
2    * Copyright 2015 Data Archiving and Networked Services (an institute of
3    * Koninklijke Nederlandse Akademie van Wetenschappen), King's College London,
4    * Georg-August-Universitaet Goettingen Stiftung Oeffentlichen Rechts
5    *
6    * Licensed under the EUPL, Version 1.1 or – as soon they will be approved by
7    * the European Commission - subsequent versions of the EUPL (the "Licence");
8    * You may not use this work except in compliance with the Licence.
9    * You may obtain a copy of the Licence at:
10   *
11   * https://joinup.ec.europa.eu/software/page/eupl
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the Licence is distributed on an "AS IS" basis,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the Licence for the specific language governing
17   * permissions and limitations under the Licence.
18   */
19  
20  package eu.ehri.project.tools;
21  
22  import com.google.common.collect.Maps;
23  import com.google.common.collect.Sets;
24  import com.tinkerpop.frames.FramedGraph;
25  import eu.ehri.project.api.Api;
26  import eu.ehri.project.api.ApiFactory;
27  import eu.ehri.project.definitions.EventTypes;
28  import eu.ehri.project.definitions.Ontology;
29  import eu.ehri.project.exceptions.DeserializationError;
30  import eu.ehri.project.exceptions.PermissionDenied;
31  import eu.ehri.project.exceptions.ValidationError;
32  import eu.ehri.project.models.AccessPoint;
33  import eu.ehri.project.models.AccessPointType;
34  import eu.ehri.project.models.DocumentaryUnit;
35  import eu.ehri.project.models.DocumentaryUnitDescription;
36  import eu.ehri.project.models.EntityClass;
37  import eu.ehri.project.models.Link;
38  import eu.ehri.project.models.Repository;
39  import eu.ehri.project.models.base.Accessor;
40  import eu.ehri.project.models.base.Actioner;
41  import eu.ehri.project.models.cvoc.Concept;
42  import eu.ehri.project.models.cvoc.Vocabulary;
43  import eu.ehri.project.persistence.ActionManager;
44  import eu.ehri.project.persistence.Bundle;
45  import eu.ehri.project.utils.Slugify;
46  import org.slf4j.Logger;
47  import org.slf4j.LoggerFactory;
48  
49  import java.util.Map;
50  import java.util.Optional;
51  import java.util.Set;
52  
53  import static com.google.common.base.Preconditions.checkNotNull;
54  
55  
56  /**
57   * Utility class for performing operations on the database
58   * related to the creation and validation of links between
59   * items.
60   */
61  public class Linker {
62  
63      private static final Logger logger = LoggerFactory.getLogger(Linker.class);
64  
65      private static final String LINK_TYPE = "associative";
66      private static final String DEFAULT_LANG = "eng";
67  
68      private final FramedGraph<?> graph;
69      private final boolean tolerant;
70      private final boolean excludeSingles;
71      private final Set<AccessPointType> accessPointTypes;
72      private final String defaultLanguageCode;
73      private final Optional<String> logMessage;
74  
75      private Linker(FramedGraph<?> graph, Set<AccessPointType> accessPointTypes,
76              String defaultLanguageCode, Optional<String> logMessage,
77              boolean tolerant, boolean excludeSingles) {
78          this.graph = graph;
79          this.accessPointTypes = accessPointTypes;
80          this.defaultLanguageCode = defaultLanguageCode;
81          this.tolerant = tolerant;
82          this.excludeSingles = excludeSingles;
83          this.logMessage = logMessage;
84      }
85  
86      public Linker(FramedGraph<?> graph) {
87          this(graph, Sets.newHashSet(),
88                  DEFAULT_LANG, Optional.empty(), false, true);
89      }
90  
91      /**
92       * Populate a pre-created vocabulary with concepts created based on
93       * access points for all collections within a repository, then link
94       * those concepts to the relevant documentary units.
95       * <p>
96       * One creation event will be generated for the newly-created concepts
97       * (with the vocabulary as the scope) and another for the newly-created
98       * links. Currently events will still be created if no concepts/links
99       * are made.
100      * <p>
101      * It should be advised that this function is not idempotent and
102      * running it twice will generate concepts/links twice.
103      * <p>
104      * NB. One could argue this function does too much...
105      *
106      * @param repository the repository
107      * @param vocabulary an existing (presumably empty) vocabulary
108      * @param user       the user to whom to attribute the operation
109      * @return the number of new links created
110      */
111     public int createAndLinkRepositoryVocabulary(
112             Repository repository,
113             Vocabulary vocabulary,
114             Actioner user)
115             throws ValidationError, PermissionDenied {
116 
117         // First, build a map of access point names to (null) concepts
118         Map<String, String> conceptIdentifierNames = Maps.newHashMap();
119         Map<String, Optional<Concept>> identifierConcept = Maps.newHashMap();
120         Map<String, Integer> identifierCount = Maps.newHashMap();
121 
122         for (DocumentaryUnit doc : repository.getAllDocumentaryUnits()) {
123             for (DocumentaryUnitDescription description : doc.getDocumentDescriptions()) {
124                 for (AccessPoint relationship : description.getAccessPoints()) {
125                     if (accessPointTypes.isEmpty() || accessPointTypes
126                             .contains(relationship.getRelationshipType())) {
127                         String trimmedName = relationship.getName().trim();
128                         String identifier = getIdentifier(relationship);
129                         String prior = conceptIdentifierNames.get(identifier);
130                         if (identifier.isEmpty() || trimmedName.isEmpty()) {
131                             logger.warn("Ignoring empty access point name");
132                         } else if (prior != null && !prior.equals(trimmedName)) {
133                             logger.warn("Concept name/slug collision: '{}' -> '{}'", trimmedName,
134                                     prior);
135                         } else {
136                             conceptIdentifierNames.put(identifier, trimmedName);
137                             identifierConcept.put(identifier, Optional.empty());
138                             int count = identifierCount.containsKey(identifier)
139                                     ? identifierCount.get(identifier)
140                                     : 0;
141                             identifierCount.put(identifier, count + 1);
142                         }
143                     }
144                 }
145             }
146         }
147 
148         // Abort if we've got no concepts - this avoids creating
149         // an event unnecessarily...
150         if (!willCreateItems(identifierCount, excludeSingles)) {
151             return 0;
152         }
153 
154         // Now create concepts for all the names
155         ActionManager actionManager = new ActionManager(graph);
156         ActionManager.EventContext conceptEvent = actionManager
157                 .setScope(vocabulary)
158                 .newEventContext(user, EventTypes.creation, logMessage);
159         Api api = ApiFactory.noLogging(graph, user.as(Accessor.class));
160 
161         for (Map.Entry<String, String> idName : conceptIdentifierNames.entrySet()) {
162             String identifier = idName.getKey();
163             String name = idName.getValue();
164 
165             // if we're excluding "unique" access points, skip this...
166             if (identifierCount.get(identifier) < 2 && excludeSingles) {
167                 continue;
168             }
169 
170             Bundle conceptBundle = Bundle.Builder.withClass(EntityClass.CVOC_CONCEPT)
171                     .addDataValue(Ontology.IDENTIFIER_KEY, identifier)
172                     .addRelation(Ontology.DESCRIPTION_FOR_ENTITY, Bundle.Builder
173                             .withClass(EntityClass.CVOC_CONCEPT_DESCRIPTION)
174                             .addDataValue(Ontology.LANGUAGE_OF_DESCRIPTION, defaultLanguageCode)
175                             .addDataValue(Ontology.NAME_KEY, name)
176                             .build())
177                     .build();
178 
179             try {
180                 Concept concept = api.create(conceptBundle, Concept.class);
181                 concept.setVocabulary(vocabulary);
182                 identifierConcept.put(identifier, Optional.of(concept));
183                 conceptEvent.addSubjects(concept);
184             } catch (ValidationError validationError) {
185                 // If this happens it is most likely because two access points
186                 // slugified to the same name due to the removal of diacritics
187                 // etc. The createOrUpdate operation currently doesn't seem to
188                 // work in the same transaction (possibly due to the graph index
189                 // not being flushed), so for the moment we're just going to log
190                 // the error and continue.
191                 logger.warn("Id/name collision error: '{}' -> '{}' ('{}')", identifier, name,
192                         conceptIdentifierNames.get(identifier));
193                 logger.error("Link integrity error: ", validationError);
194                 if (!tolerant) {
195                     throw validationError;
196                 }
197             } catch (DeserializationError e) {
198                 throw new RuntimeException(e);
199             }
200         }
201 
202         conceptEvent.commit();
203 
204         // Now link the concepts with elements having the access point from
205         // which the concept originally derived.
206         ActionManager.EventContext linkEvent = actionManager
207                 .newEventContext(user, EventTypes.creation, logMessage);
208         int linkCount = 0;
209         for (DocumentaryUnit doc : repository.getAllDocumentaryUnits()) {
210             for (DocumentaryUnitDescription description : doc.getDocumentDescriptions()) {
211                 for (AccessPoint relationship : description.getAccessPoints()) {
212                     if (accessPointTypes.isEmpty() || accessPointTypes
213                             .contains(relationship.getRelationshipType())) {
214 
215                         String identifier = getIdentifier(relationship);
216                         // if we're excluding "unique" access points, skip this...
217                         if (identifierCount.get(identifier) < 2 && excludeSingles) {
218                             continue;
219                         }
220 
221                         Optional<Concept> conceptOpt = identifierConcept.get(identifier);
222                         try {
223                             if (conceptOpt != null && conceptOpt.isPresent()) {
224                                 Concept concept = conceptOpt.get();
225                                 Bundle linkBundle = Bundle.Builder.withClass(EntityClass.LINK)
226                                         .addDataValue(Ontology.LINK_HAS_TYPE, LINK_TYPE)
227                                         .build();
228                                 Link link = api.create(linkBundle, Link.class);
229                                 link.addLinkTarget(doc);
230                                 link.addLinkTarget(concept);
231                                 link.addLinkBody(relationship);
232                                 linkEvent.addSubjects(link);
233                                 linkCount++;
234                             }
235                         } catch (DeserializationError e) {
236                             throw new RuntimeException(e);
237                         }
238                     }
239                 }
240             }
241         }
242 
243         linkEvent.commit();
244 
245         return linkCount;
246     }
247 
248     /**
249      * Set the linker to ignore concepts which would only connect to a single
250      * item.
251      *
252      * @param excludeSingles a boolean value
253      * @return a new linker object
254      */
255     public Linker withExcludeSingles(boolean excludeSingles) {
256         return new Linker(graph, accessPointTypes, DEFAULT_LANG,
257                 logMessage, tolerant, excludeSingles);
258     }
259 
260     /**
261      * Set the linker to proceed even if there are integrity errors caused
262      * by two distinct concept names slugifying to the
263      * same string.
264      *
265      * @param tolerant a boolean value
266      * @return a new linker object
267      */
268     public Linker withTolerant(boolean tolerant) {
269         return new Linker(graph, accessPointTypes, DEFAULT_LANG,
270                 logMessage, tolerant, excludeSingles);
271     }
272 
273     /**
274      * Set the default language code to use for concept descriptions.
275      *
276      * @param defaultLanguageCode a three-letter ISO-639-2 code
277      * @return a new linker object
278      */
279     public Linker withDefaultLanguage(String defaultLanguageCode) {
280         return new Linker(graph, accessPointTypes, checkNotNull(defaultLanguageCode),
281                 logMessage, tolerant, excludeSingles);
282     }
283 
284     /**
285      * Set the log message for the created items.
286      *
287      * @param logMessage a descriptive string
288      * @return a new linker object
289      */
290     Linker withLogMessage(String logMessage) {
291         return new Linker(graph, accessPointTypes, checkNotNull(defaultLanguageCode),
292                 Optional.ofNullable(logMessage), tolerant, excludeSingles);
293     }
294 
295     /**
296      * Set the log message for the created items.
297      *
298      * @param logMessage a descriptive string
299      * @return a new linker object
300      */
301     public Linker withLogMessage(Optional<String> logMessage) {
302         return new Linker(graph, accessPointTypes, checkNotNull(defaultLanguageCode),
303                 checkNotNull(logMessage), tolerant, excludeSingles);
304     }
305 
306     /**
307      * Set the linker to include <b>only</b> the given access point types, discarding
308      * those already configured. If an empty list is given, all access point types
309      * will be included.
310      *
311      * @param accessPointTypes a list of access point types
312      * @return a new linker object
313      */
314     public Linker withAccessPointTypes(Set<AccessPointType> accessPointTypes) {
315         return new Linker(graph, Sets.newHashSet(checkNotNull(accessPointTypes)),
316                 defaultLanguageCode, logMessage, tolerant, excludeSingles);
317     }
318 
319     /**
320      * Set the linker to include the given access point type, in addition
321      * to those already configured.
322      *
323      * @param accessPointType an access point type string
324      * @return a new linker object
325      */
326     Linker withAccessPointType(AccessPointType accessPointType) {
327         Set<AccessPointType> tmp = Sets.newHashSet(checkNotNull(accessPointTypes));
328         tmp.add(accessPointType);
329         return new Linker(graph, tmp, defaultLanguageCode,
330                 logMessage, tolerant, excludeSingles);
331     }
332 
333     // Helpers...
334 
335     private static boolean willCreateItems(Map<String, Integer> identifierCounts, boolean excludeSingles) {
336         if (identifierCounts.isEmpty()) {
337             return false;
338         } else if (excludeSingles) {
339             Integer maxCount = 0;
340             for (Integer c : identifierCounts.values()) {
341                 if (c != null && c > maxCount) {
342                     maxCount = c;
343                 }
344             }
345             if (maxCount < 2) {
346                 return false;
347             }
348         }
349         return true;
350     }
351 
352     private static String getIdentifier(AccessPoint relationship) {
353         return Slugify.slugify(relationship.getName().trim())
354                 .replaceAll("^-+", "")
355                 .replaceAll("-+$", "");
356     }
357 }