Commit a4fdfc6a authored by tssasha's avatar tssasha
Browse files

contexts are working!

parent 8ffd7454
......@@ -58,8 +58,8 @@ import org.apache.lucene.util.fst.FST.BytesReader;
import org.apache.lucene.util.fst.PairOutputs.Pair;
//import org.apache.lucene.util.fst.Util.Result;
//import org.apache.lucene.util.fst.Util.TopResults;
import org.apache.lucene.util.fst.UtilDebug.Result;
import org.apache.lucene.util.fst.UtilDebug.TopResults;
import org.apache.lucene.util.fst.Util.Result;
import org.apache.lucene.util.fst.Util.TopResults;
import static org.apache.lucene.util.automaton.Operations.DEFAULT_MAX_DETERMINIZED_STATES;
......@@ -386,7 +386,7 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
@Override
public void build(InputIterator iterator) throws IOException {
System.out.println("bpoint1");
//System.out.println("bpoint1");
hasPayloads = iterator.hasPayloads();
OfflineSorter sorter = new OfflineSorter(tempDir, tempFileNamePrefix, new AnalyzingComparator(hasPayloads));
......@@ -404,7 +404,7 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
count = 0;
byte buffer[] = new byte[8];
try {
System.out.println("bpoint2");
//System.out.println("bpoint2");
ByteArrayDataOutput output = new ByteArrayDataOutput(buffer);
for (BytesRef surfaceForm; (surfaceForm = iterator.next()) != null;) {
......@@ -412,7 +412,7 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
new LimitedFiniteStringsIterator(toAutomaton(surfaceForm, ts2a), maxGraphExpansions);
for (IntsRef string; (string = finiteStrings.next()) != null; count++) {
UtilDebug.toBytesRef(string, scratch);
Util.toBytesRef(string, scratch);
// length of the analyzed text (FST input)
if (scratch.length() > Short.MAX_VALUE-2) {
......@@ -427,11 +427,11 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
BytesRef payload;
if (hasPayloads) {
System.out.println("bpoint3");
//System.out.println("bpoint3");
if (surfaceForm.length > (Short.MAX_VALUE-2)) {
throw new IllegalArgumentException("cannot handle surface form > " + (Short.MAX_VALUE-2) + " in length (got " + surfaceForm.length + ")");
}
System.out.println("bpoint4");
//System.out.println("bpoint4");
payload = iterator.payload(); //----------------------------------?
// payload + surfaceLength (short)
requiredLength += payload.length + 2;
......@@ -455,7 +455,7 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
throw new IllegalArgumentException("surface form cannot contain unit separator character U+001F; this character is reserved");
}
}
System.out.println("bpoint5");
//System.out.println("bpoint5");
output.writeShort((short) surfaceForm.length);
output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
output.writeBytes(payload.bytes, payload.offset, payload.length);
......@@ -552,7 +552,7 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
analyzed.append((byte) 0);
analyzed.append((byte) dedup);
UtilDebug.toIntsRef(analyzed.get(), scratchInts);
Util.toIntsRef(analyzed.get(), scratchInts);
//System.out.println("ADD: " + scratchInts + " -> " + cost + ": " + surface.utf8ToString());
if (!hasPayloads) {
fstCompiler.add(scratchInts.get(), outputs.newPair(cost, BytesRef.deepCopyOf(surface)));
......@@ -568,7 +568,7 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
}
}
fst = fstCompiler.finish();
System.out.println("bpoint6");
//System.out.println("bpoint6");
//Util.dotToFile(fst, "/tmp/suggest.dot");
} finally {
IOUtils.closeWhileHandlingException(reader, writer);
......@@ -644,17 +644,17 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
System.out.println("Iaminlookup");
// System.out.println("Iaminlookup");
assert num > 0;
System.out.println("point-3");
// System.out.println("point-3");
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
System.out.println("point-2");
// System.out.println("point-2");
if (fst == null) {
return Collections.emptyList();
}
System.out.println("point-1");
// System.out.println("point-1");
//System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == 0x1E) {
......@@ -665,9 +665,9 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
}
}
final BytesRef utf8Key = new BytesRef(key);
System.out.println("point0");
// System.out.println("point0");
try {
System.out.println("point1");
// System.out.println("point1");
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRefBuilder spare = new CharsRefBuilder();
......@@ -689,7 +689,7 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
System.out.println("point2");
// System.out.println("point2");
int count = 0;
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
......@@ -701,36 +701,41 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
// Searcher just to find the single exact only
// match, if present:
UtilDebug.TopNSearcher<Pair<Long,BytesRef>> searcher;
System.out.println("point3");
System.out.format("topN: %d", count * maxSurfaceFormsPerAnalyzedForm);
searcher = new UtilDebug.TopNSearcher<Pair<Long, BytesRef>>(fst,
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
// System.out.println("point3");
searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst,
count * maxSurfaceFormsPerAnalyzedForm,
count * maxSurfaceFormsPerAnalyzedForm, weightComparator) {
@Override
protected boolean acceptResult(IntsRef input, PairOutputs.Pair<Long, BytesRef> output) {
System.out.println("point4");
// System.out.println("point4");
LookupResult result = getLookupResult(output.output1, output.output2, spare);
BytesRef res_context = result.payload;
System.out.println("res_context1 = " + res_context);
// System.out.println("res_context1 = " + res_context);
if (contexts == null) {
return false;
return true;
}
for (BytesRef context : contexts) {
System.out.println("context1 = " + context);
System.out.println("context 1 = " + context);
}
if (!(contexts.contains(res_context))) {
return false;
}
System.out.println("contains 1");
// System.out.println("contains 1");
return true;
}
};
System.out.println("point5");
// searcher =
// new UtilDebug.TopNSearcher<>(
// fst,
// count * maxSurfaceFormsPerAnalyzedForm,
// count * maxSurfaceFormsPerAnalyzedForm,
// weightComparator);
// System.out.println("point5");
// NOTE: we could almost get away with only using
// the first start node. The only catch is if
// maxSurfaceFormsPerAnalyzedForm had kicked in and
......@@ -740,7 +745,7 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
System.out.println("add_point");
// System.out.println("add_point");
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output()), false, path.input);
}
}
......@@ -774,27 +779,33 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
}
}
UtilDebug.TopNSearcher<Pair<Long,BytesRef>> searcher;
System.out.println("point6");
System.out.format("topN: %d ", num - results.size());
searcher = new UtilDebug.TopNSearcher<Pair<Long, BytesRef>>(fst,
Util.TopNSearcher<Pair<Long,BytesRef>> searcher;
// System.out.println("point6");
// System.out.format("topN: %d ", num - results.size());
searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst,
num - results.size(),
num * maxAnalyzedPathsForOneInput,
weightComparator) {
private final Set<BytesRef> seen = new HashSet<>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long,BytesRef> output) {
System.out.println("point7");
// System.out.println("point7");
LookupResult result = getLookupResult(output.output1, output.output2, spare);
BytesRef res_context = result.payload;
System.out.println("res_context2 = " + res_context);
// System.out.println("res_context2 = " + res_context);
// for (BytesRef context : contexts) {
// System.out.println(context);
// }
if (contexts == null) {
return false;
System.out.println("null");
return true;
}
for (BytesRef context : contexts) {
System.out.println("context2 = " + context);
System.out.println("context 2 = " + context);
}
System.out.println("payload = " + res_context);
if (!(contexts.contains(res_context))) {
return false;
}
......@@ -823,7 +834,34 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
}
}
};
System.out.println("point8");
// @Override
// protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
//
// // Dedup: when the input analyzes to a graph we
// // can get duplicate surface forms:
// if (seen.contains(output.output2)) {
// return false;
// }
// seen.add(output.output2);
//
// if (!exactFirst) {
// return true;
// } else {
// // In exactFirst mode, don't accept any paths
// // matching the surface form since that will
// // create duplicate results:
// if (sameSurfaceForm(utf8Key, output.output2)) {
// // We found exact match, which means we should
// // have already found it in the first search:
// assert results.size() == 1;
// return false;
// } else {
// return true;
// }
// }
// }
// };
// System.out.println("point8");
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path<Pair<Long,BytesRef>> path : prefixPaths) {
......@@ -844,13 +882,13 @@ public class ContextAnalyzingSuggester extends Lookup implements Accountable {
results.add(result);
if (results.size() == num) {
System.out.println("point9");
// System.out.println("point9");
// In the exactFirst=true case the search may
// produce one extra path
break;
}
}
System.out.println("point10");
// System.out.println("point10");
return results;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
......
......@@ -417,7 +417,7 @@ public final class UtilDebug {
}
public TopResults<T> search() throws IOException {
System.out.println("upoint0");
//System.out.println("upoint0");
final List<Result<T>> results = new ArrayList<>();
final BytesReader fstReader = fst.getBytesReader();
......@@ -433,7 +433,7 @@ public final class UtilDebug {
// For each top N path:
while (results.size() < topN) {
System.out.println("upoint1");
//System.out.println("upoint1");
FSTPath<T> path;
if (queue == null) {
......@@ -445,7 +445,7 @@ public final class UtilDebug {
while(it.hasNext()){
System.out.println(it.next().toString());
}
System.out.println("upoint2");
// System.out.println("upoint2");
// Remove top path since we are now going to
// pursue it:
path = queue.pollFirst();
......@@ -456,11 +456,11 @@ public final class UtilDebug {
}
//System.out.println("pop path=" + path + " arc=" + path.arc.output);
System.out.println("upoint3");
// System.out.println("upoint3");
if (acceptPartialPath(path) == false) {
continue;
}
System.out.println("upoint4");
// System.out.println("upoint4");
if (path.arc.label() == FST.END_LABEL) {
// Empty string!
......@@ -468,7 +468,7 @@ public final class UtilDebug {
results.add(new Result<>(path.input.get(), path.output));
continue;
}
System.out.println("upoint5");
// System.out.println("upoint5");
if (results.size() == topN-1 && maxQueueDepth == topN) {
// Last path -- don't bother w/ queue anymore:
......@@ -483,7 +483,7 @@ public final class UtilDebug {
// For each input letter:
while (true) {
System.out.println("upoint6");
// System.out.println("upoint6");
fst.readFirstTargetArc(path.arc, path.arc, fstReader);
......@@ -491,13 +491,13 @@ public final class UtilDebug {
boolean foundZero = false;
boolean arcCopyIsPending = false;
while(true) {
System.out.println("upoint7");
// System.out.println("upoint7");
// tricky: instead of comparing output == 0, we must
// express it via the comparator compare(output, 0) == 0
if (comparator.compare(NO_OUTPUT, path.arc.output()) == 0) {
System.out.println("upoint8");
// System.out.println("upoint8");
if (queue == null) {
System.out.println("upoint9");
// System.out.println("upoint9");
foundZero = true;
break;
} else if (!foundZero) {
......@@ -510,7 +510,7 @@ public final class UtilDebug {
addIfCompetitive(path);
}
if (path.arc.isLast()) {
System.out.println("upoint10");
// System.out.println("upoint10");
break;
}
if (arcCopyIsPending) {
......@@ -519,7 +519,7 @@ public final class UtilDebug {
}
fst.readNextArc(path.arc, fstReader);
}
System.out.println("upoint11");
// System.out.println("upoint11");
assert foundZero;
......@@ -528,7 +528,7 @@ public final class UtilDebug {
}
if (path.arc.label() == FST.END_LABEL) {
System.out.println("upoint12");
// System.out.println("upoint12");
// Add final output:
path.output = fst.outputs.add(path.output, path.arc.output());
if (acceptResult(path)) {
......@@ -538,17 +538,17 @@ public final class UtilDebug {
}
break;
} else {
System.out.println("upoint13");
// System.out.println("upoint13");
path.input.append(path.arc.label());
path.output = fst.outputs.add(path.output, path.arc.output());
if (acceptPartialPath(path) == false) {
System.out.println("upoint14");
// System.out.println("upoint14");
break;
}
}
}
}
System.out.println("upoint15");
// System.out.println("upoint15");
return new TopResults<>(rejectCount + topN <= maxQueueDepth, results);
}
......
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling.suggest;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CloseHook;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.update.SolrCoreState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.params.CommonParams.NAME;
import static org.apache.solr.spelling.suggest.fst.AnalyzingInfixLookupFactory.CONTEXTS_FIELD_NAME;
/**
* Responsible for loading the lookup and dictionary implementations specified by
* the SolrConfig.
* Interacts (query/build/reload) with Lucene suggesters through {@link Lookup} and
* {@link Dictionary}.
*/
public class MySolrSuggester implements Accountable {
/** Logger bound to the class returned by {@code MethodHandles.lookup().lookupClass()}. */
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/** Name used when an unnamed suggester config is passed. */
public static final String DEFAULT_DICT_NAME = "default";
/**
* Config key for the location of the source data - either a path to a file,
* or null for the current IndexReader.
*/
public static final String LOCATION = "sourceLocation";
/** Config key for the fully-qualified class of the {@link Lookup} implementation. */
public static final String LOOKUP_IMPL = "lookupImpl";
/** Config key for the fully-qualified class of the {@link Dictionary} implementation. */
public static final String DICTIONARY_IMPL = "dictionaryImpl";
/**
* Config key for the location where to persist the dictionary. If this location
* is relative then the data will be stored under the core's dataDir. If this
* is null the storing will be disabled.
*/
public static final String STORE_DIR = "storeDir";
/**
* Shared {@link SuggesterResult} created with the no-arg constructor.
* NOTE(review): this field is neither private nor final, so it can be reassigned
* or mutated by other code in the package - confirm that is intended.
*/
static SuggesterResult EMPTY_RESULT = new SuggesterResult();
/** Value of the {@link #LOCATION} config entry as read in init; may be null. */
private String sourceLocation;
/**
* Directory built in init from the {@link #STORE_DIR} config entry (a relative path
* is placed under the core's data dir). Not assigned in init when no store
* directory is configured.
*/
private File storeDir;
/**
* The dictionary feeding the lookup. Not assigned in init; presumably created
* from {@code dictionaryFactory} when the suggester is built - TODO confirm.
*/
private Dictionary dictionary;
/** The lookup created in init through {@code factory.create(config, core)}. */
private Lookup lookup;
/**
* Lookup factory class name: the {@link #LOOKUP_IMPL} config entry, or
* {@code LookupFactory.DEFAULT_FILE_BASED_DICT} when that entry is missing.
*/
private String lookupImpl;
/**
* Dictionary factory class name: the {@link #DICTIONARY_IMPL} config entry, or a
* default chosen in init depending on whether {@code sourceLocation} is null.
*/
private String dictionaryImpl;
/** Suggester name: the NAME config entry, or {@link #DEFAULT_DICT_NAME}. */
private String name;
/** Factory instantiated from {@code lookupImpl} via the core's resource loader. */
private LookupFactory factory;
/** Factory instantiated from {@code dictionaryImpl} via the core's resource loader. */
private DictionaryFactory dictionaryFactory;
/**
* Analyzer built in init as a {@link TokenizerChain} over a {@link StandardTokenizerFactory};
* judging by its name it is used for context filter queries - usage is not visible here.
*/
private Analyzer contextFilterQueryAnalyzer;
/**
 * Uses the <code>config</code> and the <code>core</code> to initialize the underlying
 * Lucene suggester.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>reads the suggester name, {@link #LOCATION}, {@link #LOOKUP_IMPL},
 *       {@link #DICTIONARY_IMPL} and {@link #STORE_DIR} from <code>config</code>, falling
 *       back to defaults for the name and the two implementation classes;</li>
 *   <li>creates the {@link Lookup} through the configured {@link LookupFactory} and, when
 *       the lookup is {@link Closeable}, registers a close hook on the core that closes
 *       it;</li>
 *   <li>if a store directory is configured, creates it when missing, otherwise tries to
 *       reload the lookup from the existing store file (a failure is only logged);</li>
 *   <li>instantiates the {@link DictionaryFactory} and hands it the config.</li>
 * </ol>
 *
 * @param config the suggester configuration
 * @param core the core providing the resource loader, data directory and close hooks
 * @return the name of this suggester ({@link #DEFAULT_DICT_NAME} when none is configured)
 */
public String init(NamedList<?> config, SolrCore core) {
  log.info("init: {}", config);

  // read the config (look the name up once instead of twice)
  Object configuredName = config.get(NAME);
  name = configuredName != null ? (String) configuredName : DEFAULT_DICT_NAME;
  sourceLocation = (String) config.get(LOCATION);
  lookupImpl = (String) config.get(LOOKUP_IMPL);
  dictionaryImpl = (String) config.get(DICTIONARY_IMPL);
  String store = (String) config.get(STORE_DIR);

  if (lookupImpl == null) {
    lookupImpl = LookupFactory.DEFAULT_FILE_BASED_DICT;
    log.info("No {} parameter was provided falling back to {}", LOOKUP_IMPL, lookupImpl);
  }

  // Typed empty map instead of the raw Collections.EMPTY_MAP constant, so no
  // unchecked-warning suppression is needed on the whole method.
  contextFilterQueryAnalyzer =
      new TokenizerChain(
          new StandardTokenizerFactory(Collections.<String, String>emptyMap()), null);

  // initialize appropriate lookup instance
  factory = core.getResourceLoader().newInstance(lookupImpl, LookupFactory.class);
  lookup = factory.create(config, core);

  // instanceof is already false for null, so no separate null check is required
  if (lookup instanceof Closeable) {
    core.addCloseHook(new CloseHook() {
      @Override
      public void preClose(SolrCore core) {
        try {
          ((Closeable) lookup).close();
        } catch (IOException e) {
          log.warn("Could not close the suggester lookup.", e);
        }
      }

      @Override
      public void postClose(SolrCore core) {
      }
    });
  }

  // if store directory is provided make it or load up the lookup with its content
  if (store != null && !store.isEmpty()) {
    storeDir = new File(store);
    if (!storeDir.isAbsolute()) {
      storeDir = new File(core.getDataDir() + File.separator + storeDir);
    }
    if (!storeDir.exists()) {
      // mkdirs() also returns false when another thread/process created the directory
      // in the meantime, hence the extra isDirectory() check before complaining.
      if (!storeDir.mkdirs() && !storeDir.isDirectory()) {
        log.warn("Could not create the suggester store directory {}", storeDir);
      }
    } else if (getStoreFile().exists()) {
      if (log.isDebugEnabled()) {
        log.debug("attempt reload of the stored lookup from file {}", getStoreFile());
      }
      // try-with-resources guarantees the stream is released even if load() throws
      // before it gets a chance to close it; a second close() is a harmless no-op.
      try (FileInputStream storedLookup = new FileInputStream(getStoreFile())) {
        lookup.load(storedLookup);
      } catch (IOException e) {
        // best effort: keep going with an empty lookup, but keep the cause in the log
        log.warn("Loading stored lookup data failed, possibly not cached yet", e);
      }
    }
  }

  // dictionary configuration
  if (dictionaryImpl == null) {
    dictionaryImpl = (sourceLocation == null)
        ? DictionaryFactory.DEFAULT_INDEX_BASED_DICT
        : DictionaryFactory.DEFAULT_FILE_BASED_DICT;
    log.info("No {} parameter was provided falling back to {}", DICTIONARY_IMPL, dictionaryImpl);
  }

  dictionaryFactory = core.getResourceLoader().newInstance(dictionaryImpl, DictionaryFactory.class);
  dictionaryFactory.setParams(config);
  log.info("Dictionary loaded with params: {}", config);

  return name;
}
/** Build the underlying Lucene Suggester */
public void build(SolrCore core, SolrIndexSearcher searcher) throws IOException {
log.info("MySolrSuggester.build({})", name);