@greggdonovan
Created March 5, 2025 16:28
/* When using Vespa with Lucene linguistics, the query-time analysis happens automatically when you use the YQL
query language, but in our Java API example, we need to explicitly apply the same analysis chain that's used
during indexing.
Here's the updated Java API searcher that properly applies Lucene analysis:
*/
import com.yahoo.language.Language;
import com.yahoo.language.Linguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.prelude.query.*;
import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.search.Searcher;
import com.yahoo.search.searchchain.Execution;

public class DisMaxSearcher extends Searcher {

    private final Linguistics linguistics;

    // The container injects the configured Linguistics component (Lucene linguistics when configured)
    public DisMaxSearcher(Linguistics linguistics) {
        this.linguistics = linguistics;
    }

    @Override
    public Result search(Query query, Execution execution) {
        // Get the user query string
        String userQuery = query.getModel().getQueryString();
        if (userQuery == null || userQuery.trim().isEmpty()) {
            return execution.search(query);
        }

        // Resolve the query language; fall back to English if it cannot be determined
        Language language = query.getModel().getParsingLanguage();
        if (language == null || language == Language.UNKNOWN) {
            language = Language.ENGLISH;
        }

        // Create an AND item as the root of the new query tree
        AndItem andItem = new AndItem();

        // Use Lucene linguistics to tokenize and process the query the same way as at indexing time
        Tokenizer tokenizer = linguistics.getTokenizer();
        Iterable<Token> tokens = tokenizer.tokenize(userQuery, language, StemMode.BEST, true);

        // Process each token from the analyzer
        for (Token token : tokens) {
            // Skip non-words (like punctuation and whitespace)
            if (token.getType() != TokenType.ALPHABETIC && token.getType() != TokenType.NUMERIC) {
                continue;
            }

            // Use the first stemmed form if available, otherwise the normalized token string
            String term = token.getTokenString();
            if (token.getNumStems() > 0) {
                term = token.getStem(0);
            }

            // Create an OR across fields for this term
            OrItem orItem = new OrItem();

            // Add each field, weighting title higher (the weights are illustrative; the default is 100)
            WordItem titleItem = new WordItem(term, "title", true);
            titleItem.setWeight(200);
            orItem.addItem(titleItem);
            orItem.addItem(new WordItem(term, "body", true));
            orItem.addItem(new WordItem(term, "keywords", true));

            // Add the OR item to the AND item
            andItem.addItem(orItem);
        }

        // If every token was filtered out, run the original query unchanged
        if (andItem.getItemCount() == 0) {
            return execution.search(query);
        }

        // Replace the query tree with our custom tree
        query.getModel().getQueryTree().setRoot(andItem);

        // Set the default ranking profile
        query.getRanking().setProfile("default");

        // Execute the search
        return execution.search(query);
    }
}
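/* To exercise the searcher outside a deployed application, it can be driven through a one-searcher chain, as is
   common in Vespa searcher tests. A minimal sketch: it assumes SimpleLinguistics as a stand-in for the Lucene
   linguistics component (in a real application the container injects whichever Linguistics implementation is
   configured), and the field names match those used above.
*/
import com.yahoo.component.chain.Chain;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.search.Query;
import com.yahoo.search.Searcher;
import com.yahoo.search.searchchain.Execution;

public class DisMaxSearcherDemo {

    public static void main(String[] args) {
        // Build a chain containing only our searcher, constructed with a simple (non-Lucene) linguistics
        Searcher searcher = new DisMaxSearcher(new SimpleLinguistics());
        Chain<Searcher> chain = new Chain<>(searcher);

        // Execute a query through the chain with a stub context; no backend is needed to inspect the query tree
        Query query = new Query("/search/?query=running+shoes");
        new Execution(chain, Execution.Context.createContextStub()).search(query);

        // The rewritten tree should be an AND of per-term ORs over title/body/keywords
        System.out.println(query.getModel().getQueryTree());
    }
}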
/*
Key differences:

1. Linguistics injection: we inject the Linguistics component, which provides access to the Lucene analysis
   components.

2. Tokenization and stemming: instead of manually splitting on spaces, we use Vespa's Lucene-based tokenizer
   (see the sketch after this comment) to:
   - properly tokenize the query text
   - apply language-specific processing
   - get stemmed forms of tokens
   - filter out stopwords (handled by the tokenizer)

3. Token processing: we skip non-word tokens and use stemmed forms when available.

This approach ensures that the same Lucene analysis chain (stemming, stopwords, etc.) that is applied during
indexing is also applied consistently at query time, creating a proper DisMax-like behavior where both the
documents and the query undergo the same linguistic processing.
*/
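/* Point 2 can also be observed in isolation: the Tokenizer interface used by the searcher can be called directly
   to inspect what the analysis chain produces for a given string. A minimal sketch, again assuming
   SimpleLinguistics as the stand-in implementation; with Lucene linguistics configured, the same calls run the
   Lucene analyzer chain instead.
*/
import com.yahoo.language.Language;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.simple.SimpleLinguistics;

public class TokenizeDemo {

    public static void main(String[] args) {
        Tokenizer tokenizer = new SimpleLinguistics().getTokenizer();

        // Tokenize with stemming enabled and accent removal on, as in the searcher above
        for (Token token : tokenizer.tokenize("Running Shoes!", Language.ENGLISH, StemMode.BEST, true)) {
            if (!token.isIndexable()) continue; // skips punctuation and whitespace tokens
            String stem = token.getNumStems() > 0 ? token.getStem(0) : token.getTokenString();
            System.out.println(token.getOrig() + " -> " + stem);
        }
    }
}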