/* When using Vespa with Lucene linguistics, query-time analysis happens automatically when you use the
   YQL query language, but in our Java API example we need to explicitly apply the same analysis chain
   that is used during indexing.

   Here is the updated Java API searcher that properly applies Lucene analysis:
*/
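/* For comparison, the "automatic" path mentioned above is a plain YQL request, where Vespa applies the
   configured linguistics to the user's text for you (illustrative request, not part of the original gist):

       yql=select * from sources * where userInput(@query)
       query=running shoes

   The searcher below reproduces that analysis step manually so it can build a custom query tree.
*/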
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.prelude.query.*;
import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.search.Searcher;
import com.yahoo.search.searchchain.Execution;
public class DisMaxSearcher extends Searcher {

    private final Linguistics linguistics;

    // The container injects the configured Linguistics component
    public DisMaxSearcher(Linguistics linguistics) {
        this.linguistics = linguistics;
    }
    @Override
    public Result search(Query query, Execution execution) {
        // Get the raw user query string
        String userQuery = query.getModel().getQueryString();
        if (userQuery == null || userQuery.trim().isEmpty()) {
            return execution.search(query);
        }

        // Create an AND item holding one OR-of-fields group per term
        AndItem andItem = new AndItem();

        // Use the injected linguistics to tokenize and process the query text.
        // The language comes from the query model; tokenize() returns an Iterable, not a List.
        Language language = query.getModel().getParsingLanguage();
        Tokenizer tokenizer = linguistics.getTokenizer();
        Iterable<Token> tokens = tokenizer.tokenize(userQuery, language, StemMode.BEST, true);
        // Process each token from the analyzer
        for (Token token : tokens) {
            // Skip non-word tokens (punctuation, whitespace, etc.)
            if (token.getType() != TokenType.ALPHABETIC &&
                token.getType() != TokenType.NUMERIC) {
                continue;
            }

            // getTokenString() returns the processed, indexable form of the token
            // (lowercased/stemmed); fall back to the original text if it is empty
            String term = token.getTokenString();
            if (term == null || term.isEmpty()) {
                term = token.getOrig();
            }
            // Create an OR across the target fields for this term
            // (a true DisMax takes the max field score; OR is an approximation)
            OrItem orItem = new OrItem();

            // Add the term against each field; per-field boosts can be applied
            // with Item.setWeight() if desired
            orItem.addItem(new WordItem(term, "title", true));
            orItem.addItem(new WordItem(term, "body", true));
            orItem.addItem(new WordItem(term, "keywords", true));

            // AND this term's OR group into the query
            andItem.addItem(orItem);
        }
        // Replace the query tree with our custom tree, unless every token was
        // filtered out, in which case we leave the original query untouched
        if (andItem.getItemCount() > 0) {
            query.getModel().getQueryTree().setRoot(andItem);
        }

        // Set the ranking profile and execute the search
        query.getRanking().setProfile("default");
        return execution.search(query);
    }
}
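/* To run, the searcher must be registered in a search chain. A minimal sketch of the services.xml
   wiring, assuming the code is packaged in a bundle named "my-app" (the chain id and bundle name are
   assumptions for illustration):

   <container id="default" version="1.0">
       <search>
           <chain id="default" inherits="vespa">
               <searcher id="DisMaxSearcher" bundle="my-app"/>
           </chain>
       </search>
   </container>
*/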
/*
Key differences:

1. Linguistics injection: We inject the Linguistics component, which provides access to the Lucene
   analysis components configured for the application.

2. Tokenization and stemming: Instead of manually splitting on spaces, we use Vespa's Lucene-based
   tokenizer to:
   - Properly tokenize the query text
   - Apply language-specific processing
   - Get the stemmed, indexable form of each token (via getTokenString())
   - Filter out stopwords (handled by the analysis chain)

3. Token processing: We skip non-word tokens and use the processed form of each term.

This approach ensures that the same Lucene analysis chain (stemming, stopwords, etc.) that is applied
during indexing is also applied consistently at query time, creating a proper DisMax-like behavior
where both the documents and the query undergo the same linguistic processing.
*/
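/* A minimal sketch of exercising the searcher outside a deployed container, using SimpleLinguistics
   as a stand-in for the Lucene linguistics component (the class below is illustrative, not part of
   the original gist): */
class DisMaxSearcherSketch {
    public static void main(String[] args) {
        Linguistics linguistics = new com.yahoo.language.simple.SimpleLinguistics();
        DisMaxSearcher searcher = new DisMaxSearcher(linguistics);
        // Run the searcher in a one-element chain against a stub execution context
        Execution execution = new Execution(
                new com.yahoo.component.chain.Chain<>(searcher),
                Execution.Context.createContextStub());
        Query query = new Query("/search/?query=running+shoes");
        execution.search(query);
        // The query tree now holds the AND-of-OR structure built by the searcher
        System.out.println(query.getModel().getQueryTree());
    }
}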