diff --git a/README.md b/README.md index 2c399b3..23511f3 100644 --- a/README.md +++ b/README.md @@ -214,8 +214,8 @@ _Don't forget to create your database._ ## Search Realms wiki comes with basic search capabilities but it is not recommended -for large wikis or if you require more advanced search capabilities. The only -backend we currently support is ElasticSearch +for large wikis or if you require more advanced search capabilities. The +backends we currently support are ElasticSearch and Whoosh. ### Elasticsearch Setup @@ -239,6 +239,25 @@ In your Realms Config, have the following options set: "SEARCH_TYPE": "elasticsearch" "ELASTICSEARCH_URL": "http://127.0.0.1:9200" +### Whoosh Setup + +Simply install Whoosh into your Python environment, e.g. + + pip install Whoosh + +**Configuring Whoosh** + +To use Whoosh, set the following in your Realms config: + + "SEARCH_TYPE": "whoosh" + "WHOOSH_INDEX": "/path/to/your/whoosh/index" + "WHOOSH_LANGUAGE": "en" + +WHOOSH_INDEX has to be a path that is readable and writable by the user running Realms. It will be created automatically if it doesn't exist. + +Whoosh is set up to use language optimization, so set WHOOSH_LANGUAGE to the language used in your wiki. For available languages, check `whoosh.lang.languages`. +If your language is not supported, Realms will fall back to a simple text analyzer. 
def whoosh(app):
    """Build the Whoosh search backend from the application's config.

    Reads ``WHOOSH_INDEX`` (index directory) and ``WHOOSH_LANGUAGE``
    (analyzer language code) from ``app.config``.
    """
    return WhooshSearch(app.config['WHOOSH_INDEX'], app.config['WHOOSH_LANGUAGE'])


class WhooshSearch(BaseSearch):
    """Search backend that keeps wiki pages in an on-disk Whoosh index.

    Each page is one document: ``path`` (unique, stored) holds the page
    name and ``body`` holds the analyzed page text.
    """

    def __init__(self, index_path, language):
        """Open the index at *index_path*, creating it when necessary.

        :param index_path: directory holding the Whoosh index; it is
            created (including missing parents) if it does not exist.
        :param language: language code for the body analyzer. When
            Whoosh has no stemmer or no stop-word list for it, a plain
            ``SimpleAnalyzer`` is used instead.

        Exits the process via ``sys.exit`` if the directory cannot be
        created or an existing index cannot be opened.
        """
        # Whoosh is an optional dependency, so it is imported lazily.
        from whoosh import index as whoosh_index
        from whoosh.fields import Schema, TEXT, ID
        from whoosh import qparser
        from whoosh.highlight import UppercaseFormatter
        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
        from whoosh.lang import has_stemmer, has_stopwords
        import os

        if has_stemmer(language) and has_stopwords(language):
            analyzer = LanguageAnalyzer(language)
        else:
            # TODO Display a warning?
            analyzer = SimpleAnalyzer()

        self.schema = Schema(path=ID(unique=True, stored=True), body=TEXT(analyzer=analyzer))
        self.formatter = UppercaseFormatter()

        self.index_path = index_path

        if not os.path.exists(index_path):
            try:
                # makedirs (not mkdir) so a nested, not-yet-existing
                # path such as /var/lib/realms/whoosh works as well.
                os.makedirs(index_path)
            except OSError as e:
                sys.exit("Error creating Whoosh index: %s" % e)

        if whoosh_index.exists_in(index_path):
            try:
                self.search_index = whoosh_index.open_dir(index_path)
            except whoosh_index.IndexError as e:
                sys.exit("Error opening whoosh index: %s" % (e))
        else:
            self.search_index = whoosh_index.create_in(index_path, self.schema)

        self.query_parser = qparser.MultifieldParser(["body", "path"], schema=self.schema)
        self.query_parser.add_plugin(qparser.FuzzyTermPlugin())

    @staticmethod
    def _to_unicode(value):
        """Return *value* as text, decoding UTF-8 only when it is bytes.

        Calling ``.decode`` on an already-decoded string fails (implicit
        ASCII encode on Python 2, missing method on Python 3), so text
        is passed through untouched.
        """
        if isinstance(value, bytes):
            return value.decode("utf-8")
        return value

    def index(self, index, doc_type, id_=None, body=None):
        """Add or replace the document identified by *id_*.

        :param index: unused; kept for signature parity with the other
            backends.
        :param doc_type: unused; kept for signature parity.
        :param id_: page name, stored in the unique ``path`` field.
        :param body: mapping whose ``"content"`` entry is the page text.
        :raises ValueError: if *id_* or *body* is missing.
        """
        if id_ is None or body is None:
            raise ValueError("WhooshSearch.index requires both id_ and body")

        # Convert before taking the write lock so a bad value cannot
        # leave the index locked.
        path = self._to_unicode(id_)
        content = self._to_unicode(body["content"])

        # As a context manager the writer commits on success and cancels
        # (releasing the lock) if update_document raises.
        with self.search_index.writer() as writer:
            writer.update_document(path=path, body=content)

    def index_wiki(self, name, body):
        """Index a single wiki page under its *name*."""
        self.index('wiki', 'page', id_=name, body=body)

    def delete_index(self, index):
        """Drop all indexed documents by recreating an empty index.

        :param index: unused; kept for signature parity.
        """
        from whoosh import index as whoosh_index
        self.search_index.close()
        self.search_index = whoosh_index.create_in(self.index_path, schema=self.schema)

    def wiki(self, query):
        """Search wiki pages and return highlighted matches.

        :param query: user query string; an empty query yields ``[]``.
        :returns: list of ``{"name": ..., "content": ...}`` dicts where
            ``content`` is the highlighted excerpt of the page body.
        """
        if not query:
            return []

        q = self.query_parser.parse(query)

        res = []
        # Hits read from the searcher lazily, so they are consumed
        # entirely inside the ``with`` block.
        with self.search_index.searcher() as s:
            results = s.search(q)
            results.formatter = self.formatter

            for hit in results:
                name = hit["path"]
                page = g.current_wiki.get_page(name)
                # NOTE(review): assumes get_page returns a falsy value
                # for a page that no longer exists (stale index entry)
                # -- confirm against the wiki model.
                if not page:
                    continue
                # ``body`` is not a stored field, so the text to
                # highlight is fetched from the wiki itself.
                page_data = self._to_unicode(page["data"])
                content = hit.highlights('body', text=page_data)

                res.append(dict(name=name, content=content))

        return res

    def users(self, query):
        """User search is not implemented for this backend."""
        pass