From 06a5cd5aef4ce1bc3a29098a061428cbe97f09b1 Mon Sep 17 00:00:00 2001 From: Fabian Schlager Date: Thu, 13 Nov 2014 22:07:14 +0100 Subject: [PATCH 1/7] Initial version of whoosh based search --- realms/config/__init__.py | 9 +++-- realms/modules/search/models.py | 63 +++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/realms/config/__init__.py b/realms/config/__init__.py index d686686..aae052a 100644 --- a/realms/config/__init__.py +++ b/realms/config/__init__.py @@ -86,20 +86,23 @@ DB_URI = 'sqlite:////tmp/wiki.db' CACHE_TYPE = 'simple' # Redis -#CACHE_TYPE = 'redis' +# CACHE_TYPE = 'redis' CACHE_REDIS_HOST = '127.0.0.1' CACHE_REDIS_PORT = 6379 CACHE_REDIS_DB = '0' # Memcached -#CACHE_TYPE = 'memcached' +# CACHE_TYPE = 'memcached' CACHE_MEMCACHED_SERVERS = ['127.0.0.1:11211'] -SEARCH_TYPE = 'simple' # simple is not good for large wikis +# SEARCH_TYPE = 'simple' # simple is not good for large wikis # SEARCH_TYPE = 'elasticsearch' ELASTICSEARCH_URL = 'http://127.0.0.1:9200' +SEARCH_TYPE = 'whoosh' +WHOOSH_INDEX = '/tmp/whoosh' + # Get ReCaptcha Keys for your domain here: # https://www.google.com/recaptcha/admin#whyrecaptcha RECAPTCHA_ENABLE = False diff --git a/realms/modules/search/models.py b/realms/modules/search/models.py index a9f019c..87dbedc 100644 --- a/realms/modules/search/models.py +++ b/realms/modules/search/models.py @@ -6,6 +6,10 @@ def simple(app): return SimpleSearch() +def whoosh(app): + return WhooshSearch(app.config['WHOOSH_INDEX']) + + def elasticsearch(app): from flask.ext.elastic import Elastic return ElasticSearch(Elastic(app)) @@ -42,6 +46,65 @@ class SimpleSearch(BaseSearch): pass +class WhooshSearch(BaseSearch): + def __init__(self, index_path): + from whoosh import index as whoosh_index + from whoosh.fields import Schema, TEXT, ID + from whoosh import qparser + from whoosh.highlight import UppercaseFormatter + from whoosh.analysis import LanguageAnalyzer + import os.path + + self.schema = 
Schema(path=ID(unique=True, stored=True), body=TEXT(analyzer=LanguageAnalyzer("de"))) + self.formatter = UppercaseFormatter() + + if os.path.exists(index_path): + self.search_index = whoosh_index.open_dir(index_path) + else: + os.mkdir(index_path) + self.search_index = whoosh_index.create_in(index_path, self.schema) + + self.query_parser = qparser.QueryParser("body", schema=self.schema) + self.query_parser.add_plugin(qparser.FuzzyTermPlugin()) + + def index(self, index, doc_type, id_=None, body=None): + writer = self.search_index.writer() + writer.update_document(path=id_.decode("utf-8"), body=body["content"]) + writer.commit() + + def index_wiki(self, name, body): + self.index('wiki', 'page', id_=name, body=body) + + def delete_index(self, index): + writer = self.search_index.writer() + writer.delete_by_term('path', index) + writer.commit() + + def wiki(self, query): + if not query: + return [] + + q = self.query_parser.parse("%s~2" % (query,)) + + with self.search_index.searcher() as s: + results = s.search(q) + + results.formatter = self.formatter + + res = [] + for hit in results: + name = hit["path"] + page_data = g.current_wiki.get_page(name)["data"].decode("utf-8") + content = hit.highlights('body', text=page_data) + + res.append(dict(name=name, content=content)) + + return res + + def users(self, query): + pass + + class ElasticSearch(BaseSearch): def __init__(self, elastic): self.elastic = elastic From db1f5c84a73c4b64c2b3214ef7daaacc1a2ec938 Mon Sep 17 00:00:00 2001 From: Fabian Schlager Date: Fri, 21 Nov 2014 15:40:40 +0100 Subject: [PATCH 2/7] Add fallback for non-supported languages & search in path and text --- realms/config/__init__.py | 1 + realms/modules/search/models.py | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/realms/config/__init__.py b/realms/config/__init__.py index aae052a..921e26c 100644 --- a/realms/config/__init__.py +++ b/realms/config/__init__.py @@ -102,6 +102,7 @@ ELASTICSEARCH_URL = 
'http://127.0.0.1:9200' SEARCH_TYPE = 'whoosh' WHOOSH_INDEX = '/tmp/whoosh' +WHOOSH_LANGUAGE = 'en' # Get ReCaptcha Keys for your domain here: # https://www.google.com/recaptcha/admin#whyrecaptcha diff --git a/realms/modules/search/models.py b/realms/modules/search/models.py index 87dbedc..231389b 100644 --- a/realms/modules/search/models.py +++ b/realms/modules/search/models.py @@ -7,7 +7,7 @@ def simple(app): def whoosh(app): - return WhooshSearch(app.config['WHOOSH_INDEX']) + return WhooshSearch(app.config['WHOOSH_INDEX'], app.config['WHOOSH_LANGUAGE']) def elasticsearch(app): @@ -47,38 +47,46 @@ class SimpleSearch(BaseSearch): class WhooshSearch(BaseSearch): - def __init__(self, index_path): + def __init__(self, index_path, language): from whoosh import index as whoosh_index from whoosh.fields import Schema, TEXT, ID from whoosh import qparser from whoosh.highlight import UppercaseFormatter - from whoosh.analysis import LanguageAnalyzer + from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer + from whoosh.lang import has_stemmer, has_stopwords import os.path - self.schema = Schema(path=ID(unique=True, stored=True), body=TEXT(analyzer=LanguageAnalyzer("de"))) + if not has_stemmer(language) or not has_stopwords(language): + print("Language '%s' not supported by Whoosh, falling back to default analyzer." 
% (language)) + analyzer = SimpleAnalyzer() + else: + analyzer = LanguageAnalyzer(language) + + self.schema = Schema(path=ID(unique=True, stored=True), body=TEXT(analyzer=analyzer)) self.formatter = UppercaseFormatter() + self.index_path = index_path if os.path.exists(index_path): self.search_index = whoosh_index.open_dir(index_path) else: os.mkdir(index_path) self.search_index = whoosh_index.create_in(index_path, self.schema) - self.query_parser = qparser.QueryParser("body", schema=self.schema) + self.query_parser = qparser.MultifieldParser(["body", "path"], schema=self.schema) self.query_parser.add_plugin(qparser.FuzzyTermPlugin()) def index(self, index, doc_type, id_=None, body=None): writer = self.search_index.writer() - writer.update_document(path=id_.decode("utf-8"), body=body["content"]) + writer.update_document(path=id_.decode("utf-8"), body=body["content"].decode("utf-8")) writer.commit() def index_wiki(self, name, body): self.index('wiki', 'page', id_=name, body=body) def delete_index(self, index): - writer = self.search_index.writer() - writer.delete_by_term('path', index) - writer.commit() + from whoosh import index as whoosh_index + self.search_index.close() + self.search_index = whoosh_index.create_in(self.index_path, schema=self.schema) def wiki(self, query): if not query: From e1aa96ea5fd36e778b0609538ba27d21467d1d70 Mon Sep 17 00:00:00 2001 From: Fabian Schlager Date: Fri, 21 Nov 2014 16:30:13 +0100 Subject: [PATCH 3/7] Check if WHOOSH_INDEX path is read & writeable --- realms/modules/search/models.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/realms/modules/search/models.py b/realms/modules/search/models.py index 231389b..38a40c0 100644 --- a/realms/modules/search/models.py +++ b/realms/modules/search/models.py @@ -7,6 +7,13 @@ def simple(app): def whoosh(app): + import os + import sys + for mode in [os.W_OK, os.R_OK]: + if not os.access(app.config['WHOOSH_INDEX'], mode): + sys.exit('Read and write access to 
WHOOSH_INDEX is required (%s)' % + app.config['WHOOSH_INDEX']) + return WhooshSearch(app.config['WHOOSH_INDEX'], app.config['WHOOSH_LANGUAGE']) @@ -57,7 +64,7 @@ class WhooshSearch(BaseSearch): import os.path if not has_stemmer(language) or not has_stopwords(language): - print("Language '%s' not supported by Whoosh, falling back to default analyzer." % (language)) + # TODO Display a warning? analyzer = SimpleAnalyzer() else: analyzer = LanguageAnalyzer(language) From 6ec3ff6acbfcf53bef4b79e6e9d8c3b686aa2060 Mon Sep 17 00:00:00 2001 From: Fabian Schlager Date: Fri, 21 Nov 2014 16:45:20 +0100 Subject: [PATCH 4/7] Remove fuzzy search, needs some tweaking probably --- realms/modules/search/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/realms/modules/search/models.py b/realms/modules/search/models.py index 38a40c0..2712236 100644 --- a/realms/modules/search/models.py +++ b/realms/modules/search/models.py @@ -99,7 +99,7 @@ class WhooshSearch(BaseSearch): if not query: return [] - q = self.query_parser.parse("%s~2" % (query,)) + q = self.query_parser.parse(query) with self.search_index.searcher() as s: results = s.search(q) From d8e7ed7ad47b92a6c097a0864f52d92f2440de17 Mon Sep 17 00:00:00 2001 From: Fabian Schlager Date: Fri, 21 Nov 2014 16:49:36 +0100 Subject: [PATCH 5/7] Restored default config --- realms/config/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/realms/config/__init__.py b/realms/config/__init__.py index 921e26c..40533b9 100644 --- a/realms/config/__init__.py +++ b/realms/config/__init__.py @@ -95,12 +95,12 @@ CACHE_REDIS_DB = '0' # CACHE_TYPE = 'memcached' CACHE_MEMCACHED_SERVERS = ['127.0.0.1:11211'] -# SEARCH_TYPE = 'simple' # simple is not good for large wikis +SEARCH_TYPE = 'simple' # simple is not good for large wikis # SEARCH_TYPE = 'elasticsearch' ELASTICSEARCH_URL = 'http://127.0.0.1:9200' -SEARCH_TYPE = 'whoosh' +# SEARCH_TYPE = 'whoosh' WHOOSH_INDEX = '/tmp/whoosh' WHOOSH_LANGUAGE = 
'en' From a8f61dfb13079ea807a4ec3f865cb1d7047d39d8 Mon Sep 17 00:00:00 2001 From: Fabian Schlager Date: Fri, 21 Nov 2014 17:11:27 +0100 Subject: [PATCH 6/7] Clean up & fix index creation, handling exceptions --- realms/modules/search/models.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/realms/modules/search/models.py b/realms/modules/search/models.py index 2712236..8950d4d 100644 --- a/realms/modules/search/models.py +++ b/realms/modules/search/models.py @@ -1,3 +1,5 @@ +import sys + from flask import g, current_app from realms.lib.util import filename_to_cname @@ -7,13 +9,6 @@ def simple(app): def whoosh(app): - import os - import sys - for mode in [os.W_OK, os.R_OK]: - if not os.access(app.config['WHOOSH_INDEX'], mode): - sys.exit('Read and write access to WHOOSH_INDEX is required (%s)' % - app.config['WHOOSH_INDEX']) - return WhooshSearch(app.config['WHOOSH_INDEX'], app.config['WHOOSH_LANGUAGE']) @@ -61,7 +56,7 @@ class WhooshSearch(BaseSearch): from whoosh.highlight import UppercaseFormatter from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer from whoosh.lang import has_stemmer, has_stopwords - import os.path + import os if not has_stemmer(language) or not has_stopwords(language): # TODO Display a warning? 
@@ -73,10 +68,19 @@ class WhooshSearch(BaseSearch): self.formatter = UppercaseFormatter() self.index_path = index_path - if os.path.exists(index_path): - self.search_index = whoosh_index.open_dir(index_path) + + if not os.path.exists(index_path): + try: + os.mkdir(index_path) + except OSError as e: + sys.exit("Error creating Whoosh index: %s" % e) + + if whoosh_index.exists_in(index_path): + try: + self.search_index = whoosh_index.open_dir(index_path) + except whoosh_index.IndexError as e: + sys.exit("Error opening whoosh index: %s" % (e)) else: - os.mkdir(index_path) self.search_index = whoosh_index.create_in(index_path, self.schema) self.query_parser = qparser.MultifieldParser(["body", "path"], schema=self.schema) From b38b4238f39c33682f6ebd347b86b2ed8c63771e Mon Sep 17 00:00:00 2001 From: Fabian Schlager Date: Fri, 21 Nov 2014 17:24:49 +0100 Subject: [PATCH 7/7] Add information about Whoosh to README --- README.md | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2c399b3..23511f3 100644 --- a/README.md +++ b/README.md @@ -214,8 +214,8 @@ _Don't forget to create your database._ ## Search Realms wiki comes with basic search capabilities but it is not recommended -for large wikis or if you require more advanced search capabilities. The only -backend we currently support is ElasticSearch +for large wikis or if you require more advanced search capabilities. The +backends we currently support are ElasticSearch and Whoosh. ### Elasticsearch Setup @@ -239,6 +239,25 @@ In your Realms Config, have the following options set: "SEARCH_TYPE": "elasticsearch" "ELASTICSEARCH_URL": "http://127.0.0.1:9200" +### Whoosh Setup + +Simply install Whoosh to your Python environment, e.g. 
+ + pip install Whoosh + +**Configuring Whoosh** + +To use Whoosh, set the following in your Realms config: + + "SEARCH_TYPE": "whoosh" + "WHOOSH_INDEX": "/path/to/your/whoosh/index" + "WHOOSH_LANGUAGE": "en" + +WHOOSH_INDEX has to be a path that is readable and writable by the user running Realms. It will be created automatically if it doesn't exist. + +Whoosh is set up to use language optimization, so set WHOOSH_LANGUAGE to the language used in your wiki. For available languages, check whoosh.lang.languages. +If your language is not supported, Realms will fall back to a simple text analyzer. + ## Running realms-wiki start