libreccm-legacy/ccm-ldn-aplaws/src/com/arsdigita/aplaws/AutoCategorisation.java

689 lines
30 KiB
Java
Executable File

package com.arsdigita.aplaws;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.math.BigDecimal;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.StringTokenizer;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.rpc.ParameterMode;
import org.apache.axis.Constants;
import org.apache.axis.client.Call;
import org.apache.axis.client.Service;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.OptionBuilder;
import org.apache.log4j.Logger;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import com.arsdigita.aplaws.ui.ItemCategoryPicker;
import com.arsdigita.categorization.Category;
import com.arsdigita.cms.ContentBundle;
import com.arsdigita.cms.ContentItem;
import com.arsdigita.cms.ContentSection;
import com.arsdigita.cms.Folder;
import com.arsdigita.cms.dispatcher.ItemResolver;
import com.arsdigita.domain.DataObjectNotFoundException;
import com.arsdigita.domain.DomainCollection;
// import com.arsdigita.domain.DomainObjectFactory;
import com.arsdigita.domain.DomainServiceInterfaceExposer;
import com.arsdigita.kernel.Kernel;
import com.arsdigita.kernel.KernelExcursion;
import com.arsdigita.london.cms.dublin.DublinCoreItem;
import com.arsdigita.london.navigation.Navigation;
import com.arsdigita.london.navigation.NavigationFileResolver;
import com.arsdigita.london.terms.Domain;
import com.arsdigita.london.terms.Term;
import com.arsdigita.london.util.Transaction;
import com.arsdigita.persistence.DataAssociation;
import com.arsdigita.persistence.DataAssociationCursor;
import com.arsdigita.persistence.DataCollection;
import com.arsdigita.persistence.DataObject;
import com.arsdigita.persistence.Filter;
// import com.arsdigita.persistence.OID;
import com.arsdigita.persistence.SessionManager;
import com.arsdigita.search.ContentProvider;
import com.arsdigita.search.ContentType;
import com.arsdigita.search.MetadataProvider;
import com.arsdigita.search.MetadataProviderRegistry;
import com.arsdigita.web.Application;
/**
* Command line tool to automatically assign terms.
*
* Input is a MASmedia Searchlight Indexer report XML file.
* This service is available for registered users
* at http://demo.masprovider.com/searchLight/
*
* @author apevec@redhat.com
*/
public class AutoCategorisation extends com.arsdigita.packaging.Program {
private static final Logger LOG = Logger.getLogger(AutoCategorisation.class);
private static final String CCM_PREFIX = "/ccm/";
private SAXParser parser;
public AutoCategorisation() {
super("AutoCategorisation", "1.0", "FILENAME(Searchlight XML report)");
getOptions().addOption
(OptionBuilder
.hasArg(false)
.withLongOpt("test")
.withDescription("Test only")
.create('t'));
try {
parser = SAXParserFactory.newInstance().newSAXParser();
} catch (ParserConfigurationException pce) {
throw new RuntimeException("SAX parser configuration error", pce);
} catch (SAXException se) {
throw new RuntimeException("SAX parser error", se);
}
}
protected void doRun(final CommandLine cmdLine) {
new Transaction() {
public void doRun() {
new KernelExcursion() {
public void excurse() {
setEffectiveParty(Kernel.getSystemParty());
String[] args = cmdLine.getArgs();
if (args.length == 1) {
String filename = args[0];
try {
boolean persistChanges = ! cmdLine.hasOption('t');
parser.parse(new InputSource(new FileReader(filename)),
new SearchlightHandler(isDebug(), isVerbose(), persistChanges));
} catch (SAXException ex) {
throw new RuntimeException(ex);
} catch (IOException ex) {
throw new RuntimeException(ex);
}
} else {
help(System.err);
System.exit(1);
}
}
}.run();
}
}.run();
}
public static void main(String[] args) {
new AutoCategorisation().run(args);
}
/**
* Parse the XML site report from MASmedia Searchlight Indexer.
*/
private static class SearchlightHandler extends DefaultHandler {
boolean isDebug;
boolean isVerbose;
boolean persistChanges;
StringBuffer buffer;
String urlid;
String url;
ContentItem item;
String scheme;
Collection keywords = new HashSet();
Collection terms = new HashSet();
Domain ipsv, lgcl, gcl, lgsl, lgdl;
public SearchlightHandler(boolean isDebug, boolean isVerbose, boolean persistChanges) {
this.isDebug = isDebug;
this.isVerbose = isVerbose;
this.persistChanges = persistChanges;
ipsv = Domain.retrieve("IPSV");
lgcl = Domain.retrieve("LGCL");
gcl = Domain.retrieve("GCL");
lgsl = Domain.retrieve("LGSL");
lgdl = Domain.retrieve("LGDL");
}
public void startDocument() throws SAXException {
if (isDebug) {
out("startDoc");
}
}
public void characters(char[] ch, int start, int len)
throws SAXException {
for (int i = 0; i < len; i++) {
buffer.append(ch[start + i]);
}
}
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
if (isDebug) {
out("startElement " + qName);
}
buffer = new StringBuffer();
if ("url".equals(qName)) {
urlid = null;
url = null;
item = null;
} else if ("unformated_data".equals(qName)) {
keywords.clear();
terms.clear();
scheme = null;
} else if ("keyword_data".equals(qName)) {
scheme = attributes.getValue("scheme");
} else if ("category_data".equals(qName)) {
scheme = attributes.getValue("scheme");
} // if qName
}
// XXX term@id is *NOT* Term.uniqueID, use unformated_data/category_data
// and retrieve terms by name
/* example from CAMDEN.xml
<tags>
<terms>
<term score="40" thesarus="LGTL" id="9087">A to Z</term>
<term score="6" thesarus="LGCS" id="8030">Development control</term>
<term score="6" thesarus="LGSL" id="10213">Development control</term>
<term score="6" thesarus="IPSV" id="12738">Development control</term>
<term score="6" thesarus="IPSV" id="11440">Domestic violence</term>
<term score="6" thesarus="IPSV" id="11695">Planning (town and country)</term>
</terms>
<category><![CDATA[<meta name="eGMS.subject.category" scheme="LGCS" content="Development Control" />]]><![CDATA[<meta name="eGMS.subject.category" scheme="LGSL" content="Development Control" />]]><![CDATA[<meta name="eGMS.subject.category" scheme="IPSV" content="Domestic violence; Planning (town and country)" />]]></category>
<keyword><![CDATA[<meta name="eGMS.subject.keyword" scheme="LGTL" content="A to Z" />]]><![CDATA[<meta name="eGMS.subject.keyword" scheme="IPSV" content="Development Control" />]]></keyword>
<unformated_data>
<keyword_data scheme="LGTL">A to Z</keyword_data>
<keyword_data scheme="IPSV">Development Control</keyword_data>
<category_data scheme="LGCS">Development Control</category_data>
<category_data scheme="LGSL">Development Control</category_data>
<category_data scheme="IPSV">Domestic violence; Planning (town and country)</category_data>
</unformated_data>
</tags>
*/
@Override
public void endElement(String uri, String localName, String qName)
throws SAXException {
if (isDebug) {
out("endElement " + qName);
}
if ("urlid".equals(qName)) {
urlid = buffer.toString();
if (isVerbose) {
out("urlid "+urlid);
}
} else if ("urladdress".equals(qName)) {
findItem();
if (item != null && isVerbose) {
out("item " + item + " at " + url);
}
} else if ("keyword_data".equals(qName)) {
// merge all keywords
StringTokenizer tok = new StringTokenizer(buffer.toString(),
";");
while (tok.hasMoreTokens()) {
keywords.add(tok.nextToken().trim());
}
} else if ("category_data".equals(qName)) {
// use only IPSV terms
if ("IPSV".equals(scheme)) {
StringTokenizer tok = new StringTokenizer(
buffer.toString(), ";");
while (tok.hasMoreTokens()) {
String name = tok.nextToken().trim();
Term term = findTerm(scheme, name);
if (term != null) {
terms.add(term);
} else if (isVerbose) {
out("term not found " + scheme + '/' + name);
}
}
}
} else if ("unformated_data".equals(qName)) {
assignKeywords();
assignTerms();
} // if qName
}
@Override
public void endDocument() throws SAXException {
if (isDebug) {
out("endDoc");
}
}
private void findItem() {
url = buffer.toString();
// resolve url to the item
// supported are Navigation and ContentSection URLs
int ccmPrefix = url.indexOf(CCM_PREFIX);
if (ccmPrefix > -1) {
int appBegin = ccmPrefix + 5;
int appEnd = url.indexOf('/', appBegin);
if (appEnd > appBegin) {
String appURL = url.substring(appBegin, appEnd);
Application app = Application
.retrieveApplicationForPath('/' + appURL + '/');
if (app != null) {
String appType = app.getApplicationType()
.getApplicationObjectType();
if (ContentSection.BASE_DATA_OBJECT_TYPE
.equals(appType)) {
// a Content Section URL detected, resolving
// item path
ContentSection cs = (ContentSection) app;
ItemResolver resolver = cs.getItemResolver();
if (resolver != null) {
int queryBegin = url.indexOf('?', appEnd);
if (queryBegin > appEnd) {
// MPA URLs can have ?page=N which confuses
// c.a.cms.d.MLIR
url = url.substring(0, queryBegin);
}
item = resolver.getItem(cs, url
.substring(appEnd), ContentItem.LIVE);
if (item != null) {
item = item.getDraftVersion();
}
// make sure we return real content item,
// and not structure (folder or bundle)
if (item != null && item instanceof Folder) {
item = ((Folder) item).getIndexItem();
}
if (item != null
&& item instanceof ContentBundle) {
item = ((ContentBundle) item)
.getPrimaryInstance();
}
if (item == null) {
out("item not found for " + url);
}
} else {
if (isVerbose) {
out("ContentSection without resolver");
}
}
} else if (Navigation.BASE_DATA_OBJECT_TYPE
.equals(appType)) {
// a Navigation URL detected, resolving category
Navigation nav = (Navigation) app;
Category cat = null;
// categoryID=
int catBegin = url.indexOf("categoryID=",
appEnd);
if (catBegin > appEnd) {
BigDecimal catID = new BigDecimal(url
.substring(catBegin + 11));
cat = new Category(catID);
} else {
// named cat path, resolve using
// default context XXX subsites?
Category root = Category.getRootForObject(
nav, null);
Category[] cats = NavigationFileResolver
.resolveCategory(root, url
.substring(appEnd));
if (cats != null && cats.length != 0) {
cat = cats[cats.length - 1];
} else {
if (isVerbose) {
out("category path not found " + url);
}
}
}
// category index item
if (cat != null) {
ContentBundle bundle = (ContentBundle) cat.getIndexObject();
if (bundle != null) {
item = bundle.getPrimaryInstance();
}
}
} else {
if (isVerbose) {
out("unsupported application " + appType + " at " + url);
}
}
} else {
if (isVerbose) {
out("application not found " + url);
}
}
} else {
if (isVerbose) {
out("unsupported CCM url " + url);
}
}
} else {
if (isVerbose) {
out("unsupported url " + url);
}
}
}
private void assignTerms() {
if (item != null) {
ContentBundle bundle = (ContentBundle) item.getParent();
Collection manualCategories = new HashSet();
Collection oldAutoCategories = new HashSet();
Collection newAutoCategories = new LinkedList();
DataAssociationCursor cursor = ((DataAssociation)DomainServiceInterfaceExposer
.get(bundle, "categories")).cursor();
// cat_object_category_map.auto_p
// cursor.addEqualsFilter("link.isAuto", Boolean.FALSE);
while (cursor.next()) {
Object categoryID = cursor.get("id");
Boolean isAuto = (Boolean) cursor.getLinkProperty("isAuto");
if (isAuto.booleanValue()) {
oldAutoCategories.add(categoryID);
} else {
manualCategories.add(categoryID);
}
}
// assign all new auto-derived terms, unless it's already assigned
// prefer manual IPSV/LGCL over auto IPSV
// check existing manual terms
Collection manualIPSV = new LinkedList();
Collection manualLGCL = new LinkedList();
if (!manualCategories.isEmpty()) {
DataCollection dc = SessionManager.getSession().retrieve(Term.BASE_DATA_OBJECT_TYPE);
Filter f = dc.addFilter("model.id IN :manualCats");
f.set("manualCats", manualCategories);
dc.addFilter("domain.key IN ('IPSV','LGCL')");
dc.addPath("domain.key");
dc.addPath("model.id");
while (dc.next()) {
String domainKey = (String)dc.get("domain.key");
if ("IPSV".equals(domainKey)) {
manualIPSV.add(dc.get("model.id"));
} else if ("LGCL".equals(domainKey)) {
manualLGCL.add(dc.get("model.id"));
}
}
}
boolean canAutoIPSV = true;
boolean canAutoLGCL = true;
if (manualLGCL.isEmpty() && !manualIPSV.isEmpty()) {
// auto-assign related LGCL from manual IPSV
if (isVerbose) {
out("derive from manual IPSV");
}
// TODO move getRelatedTerms etc. out of UI code
Collection relatedLGCL = ItemCategoryPicker
.getRelatedTerms(manualIPSV, lgcl);
if (!relatedLGCL.isEmpty()) {
ItemCategoryPicker.assignTerms(relatedLGCL, bundle);
canAutoLGCL = false;
}
}
if (manualIPSV.isEmpty() && !manualLGCL.isEmpty()) {
// auto-assign related IPSV from manual LGCL
if (isVerbose) {
out("derive from manual LGCL");
}
Collection relatedIPSV = ItemCategoryPicker
.getRelatedTerms(manualLGCL, ipsv);
if (!relatedIPSV.isEmpty()) {
ItemCategoryPicker.assignTerms(relatedIPSV, bundle);
canAutoIPSV = false;
}
}
// auto assign IPSV/LGCL
for (Iterator iter = terms.iterator(); iter.hasNext();) {
Term t = (Term) iter.next();
Domain d = t.getDomain();
if (canAutoIPSV && ipsv.equals(d) || canAutoLGCL
&& lgcl.equals(d)) {
BigDecimal categoryID = t.getModel().getID();
if ( !manualCategories.contains(categoryID)) {
if (!oldAutoCategories.contains(categoryID)) {
if (persistChanges) {
t.addObject(bundle);
}
newAutoCategories.add(categoryID); // to be marked isAuto
if (isVerbose) {
out("autoASSIGN " + t + " to " + bundle);
}
} else {
oldAutoCategories.remove(categoryID);
if (isVerbose) {
out("already auto assigned " + t);
}
}
} else if (isVerbose) {
out("already manually assigned " + t);
}
}
}
// cleanup old auto-assigned terms
for (Iterator iter = oldAutoCategories.iterator(); iter
.hasNext();) {
Category category = new Category((BigDecimal) iter.next());
if (persistChanges) {
category.removeChild(bundle);
}
if (isVerbose) {
out("removing oldAuto " + category + " from " + bundle);
}
}
if (!newAutoCategories.isEmpty()) {
cursor = ((DataAssociation) DomainServiceInterfaceExposer
.get(bundle, "categories")).cursor();
Filter f = cursor.addFilter("id IN :newAutoCats");
f.set("newAutoCats", newAutoCategories);
while (cursor.next()) {
Object categoryID = cursor.get("id");
if (persistChanges) {
DataObject link = cursor.getLink();
link.set("isAuto", Boolean.TRUE);
}
if (isVerbose) {
out("isAuto=TRUE for new categoryID=" + categoryID
+ "/" + bundle);
}
}
}
// TODO assign related GCL, LGSL, LGDL - move that code out of
// UI (ItemCategoryPicker)
}
}
private void assignKeywords() {
if (item != null && !keywords.isEmpty()) {
DublinCoreItem dcItem = DublinCoreItem.findByOwner(item);
if (dcItem != null) {
// preserve existing dcItem.getKeywords()
// NOTE: "DC keywords" metadata is stored as a string, cannot tell which keywords are auto.
String dcKeywords = dcItem.getKeywords();
if (dcKeywords != null) {
StringTokenizer tok = new StringTokenizer(dcKeywords, ";");
while (tok.hasMoreTokens()) {
keywords.add(tok.nextToken().trim());
}
}
}
StringBuffer buf = new StringBuffer();
// reconstruct "DC keywords" and store them
Iterator i=keywords.iterator();
if (i.hasNext()) {
buf.append(i.next());
}
for (; i.hasNext();) {
buf.append(';').append(' ').append(i.next());
}
String dcKeywords = buf.toString();
if (isVerbose) {
out("ASSIGN DC keywords \""+dcKeywords+"\"");
}
if (persistChanges) {
if (dcItem == null) {
dcItem = DublinCoreItem.create(item);
}
dcItem.setKeywords(dcKeywords);
}
}
}
}
private static void out(String line) {
LOG.info(line);
}
private static Term findTerm(String domainKey, String name) {
try {
Domain domain = Domain.retrieve(domainKey);
if (domain != null) {
DomainCollection terms = domain.getTerms();
Filter f = terms.addFilter("upper("+Term.NAME+") = :name");
f.set("name", name.toUpperCase());
if (terms.next()) {
Term term = (Term) terms.getDomainObject();
terms.close();
return term;
}
}
} catch (DataObjectNotFoundException donfe) {
// domain not found
}
return null;
}
/**
* Call Searchlight web service.
* Process service response and try to map detected categories to TermS.
*
* @param item content item
* @return a list of suggested TermS derived from the text extracted
* from the given content item
*/
public static Collection getAutoTerms(ContentItem item) throws ServiceFailed {
Collection terms = new HashSet();
try {
// text extraction, see c.a.search.lucene.DocumentObserver
MetadataProvider adapter = MetadataProviderRegistry
.findAdapter(item.getObjectType());
if (adapter != null) {
ContentProvider[] content = adapter.getContent(item,
ContentType.TEXT);
StringBuffer buf = new StringBuffer();
for (int i = 0, n = content.length; i < n; i++) {
if (content[i].getType().equals(ContentType.TEXT)) {
buf.append(new String(content[i].getBytes()));
}
}
String endpoint = Aplaws.getAplawsConfig().getAutocatServiceURL();
String version = "1.0";
Service service = new Service();
Call call = (Call) service.createCall();
call.setTargetEndpointAddress(new java.net.URL(endpoint));
call.setOperationName("getCategoriesAndKeywordsFromText");
call.addParameter("text", Constants.XSD_STRING,
ParameterMode.IN);
call.addParameter("version", Constants.XSD_STRING,
ParameterMode.IN);
call.setUsername(Aplaws.getAplawsConfig()
.getAutocatServiceUsername());
call.setPassword(Aplaws.getAplawsConfig()
.getAutocatServicePassword());
call.setReturnType(Constants.XSD_STRING);
LOG.debug("username/password="+call.getUsername()+"/"+call.getPassword());
String result = (String) call.invoke(new Object[] {
buf.toString(), version });
// parse the response from autocat service
SAXParser parser;
parser = SAXParserFactory.newInstance().newSAXParser();
parser.parse(new InputSource(new StringReader(result)),
new SearchlightServiceHandler(terms));
} else {
LOG.info("no metadata adapter for " + item);
}
} catch (Exception e) {
LOG.info("autocat service call failed", e);
throw new ServiceFailed(e);
}
return terms;
}
// example autocat service response:
/*<searchlight:textminer
* xmlns:searchlight="http://www.cintra.com/MASmedia/Searchlight">
* <searchlight:results>
* <searchlight:category-matches>
* <searchlight:category-match score="11">
* <searchlight:category>Community Safety</searchlight:category>
* <searchlight:thesaurus-ref thesaurus="LGCS" version="0.01" id="182" />
* <searchlight:thesaurus-ref thesaurus="LGSL" version="2.02" id="870" />
* <searchlight:thesaurus-ref thesaurus="IPSV" version="2.00" id="6280" />
* </searchlight:category-match>
* ...
*/
private static class SearchlightServiceHandler extends DefaultHandler {
StringBuffer buffer;
String category = null;
String thesaurus = null;
String score = null;
Collection terms;
public SearchlightServiceHandler(Collection terms) {
this.terms = terms;
}
public void characters(char[] ch, int start, int len)
throws SAXException {
for (int i = 0; i < len; i++) {
buffer.append(ch[start + i]);
}
}
public void startElement(String uri, String localName, String qName,
Attributes attributes) throws SAXException {
buffer = new StringBuffer();
if ("searchlight:category-match".equals(qName)) {
score = attributes.getValue("score");
} else if (category != null && score != null
&& "searchlight:thesaurus-ref".equals(qName)) {
thesaurus = attributes.getValue("thesaurus");
if ("IPSV".equals(thesaurus)) {
LOG.debug("IPSV "+category);
Term t = findTerm(thesaurus, category);
if (t != null) {
terms.add(t);
LOG.debug("term = "+t);
} else {
LOG.debug("term not found");
}
}
}
}
public void endElement(String uri, String localName, String qName)
throws SAXException {
if ("searchlight:category".equals(qName)) {
category = buffer.toString();
} else if ("searchlight:category-match".equals(qName)) {
category = null; score = null; thesaurus = null;
} else if ("searchlight:category-matches".equals(qName)) {
LOG.debug("terms autodetected "+terms);
}
}
}
public static class ServiceFailed extends RuntimeException {
public ServiceFailed(Throwable cause) {
super(cause);
}
}
}