From db1f5c84a73c4b64c2b3214ef7daaacc1a2ec938 Mon Sep 17 00:00:00 2001
From: Fabian Schlager
Date: Fri, 21 Nov 2014 15:40:40 +0100
Subject: [PATCH] Add fallback for non-supported languages & search in path and text

---
 realms/config/__init__.py       |  1 +
 realms/modules/search/models.py | 26 +++++++++++++++++---------
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/realms/config/__init__.py b/realms/config/__init__.py
index aae052a..921e26c 100644
--- a/realms/config/__init__.py
+++ b/realms/config/__init__.py
@@ -102,6 +102,7 @@ ELASTICSEARCH_URL = 'http://127.0.0.1:9200'
 
 SEARCH_TYPE = 'whoosh'
 WHOOSH_INDEX = '/tmp/whoosh'
+WHOOSH_LANGUAGE = 'en'
 
 # Get ReCaptcha Keys for your domain here:
 # https://www.google.com/recaptcha/admin#whyrecaptcha
diff --git a/realms/modules/search/models.py b/realms/modules/search/models.py
index 87dbedc..231389b 100644
--- a/realms/modules/search/models.py
+++ b/realms/modules/search/models.py
@@ -7,7 +7,7 @@ def simple(app):
 
 
 def whoosh(app):
-    return WhooshSearch(app.config['WHOOSH_INDEX'])
+    return WhooshSearch(app.config['WHOOSH_INDEX'], app.config['WHOOSH_LANGUAGE'])
 
 
 def elasticsearch(app):
@@ -47,38 +47,46 @@ class SimpleSearch(BaseSearch):
 
 
 class WhooshSearch(BaseSearch):
-    def __init__(self, index_path):
+    def __init__(self, index_path, language):
         from whoosh import index as whoosh_index
         from whoosh.fields import Schema, TEXT, ID
         from whoosh import qparser
         from whoosh.highlight import UppercaseFormatter
-        from whoosh.analysis import LanguageAnalyzer
+        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
+        from whoosh.lang import has_stemmer, has_stopwords
         import os.path
 
-        self.schema = Schema(path=ID(unique=True, stored=True), body=TEXT(analyzer=LanguageAnalyzer("de")))
+        if not has_stemmer(language) or not has_stopwords(language):
+            print("Language '%s' not supported by Whoosh, falling back to default analyzer." % (language))
+            analyzer = SimpleAnalyzer()
+        else:
+            analyzer = LanguageAnalyzer(language)
+
+        self.schema = Schema(path=ID(unique=True, stored=True), body=TEXT(analyzer=analyzer))
         self.formatter = UppercaseFormatter()
 
+        self.index_path = index_path
         if os.path.exists(index_path):
             self.search_index = whoosh_index.open_dir(index_path)
         else:
             os.mkdir(index_path)
             self.search_index = whoosh_index.create_in(index_path, self.schema)
 
-        self.query_parser = qparser.QueryParser("body", schema=self.schema)
+        self.query_parser = qparser.MultifieldParser(["body", "path"], schema=self.schema)
         self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
 
     def index(self, index, doc_type, id_=None, body=None):
         writer = self.search_index.writer()
-        writer.update_document(path=id_.decode("utf-8"), body=body["content"])
+        writer.update_document(path=id_.decode("utf-8"), body=body["content"].decode("utf-8"))
         writer.commit()
 
     def index_wiki(self, name, body):
         self.index('wiki', 'page', id_=name, body=body)
 
     def delete_index(self, index):
-        writer = self.search_index.writer()
-        writer.delete_by_term('path', index)
-        writer.commit()
+        from whoosh import index as whoosh_index
+        self.search_index.close()
+        self.search_index = whoosh_index.create_in(self.index_path, schema=self.schema)
 
     def wiki(self, query):
         if not query: