Merge pull request #40 from ghtyrant/master

Add Whoosh as an alternate search backend
This commit is contained in:
Matthew Scragg 2014-11-23 18:33:24 -06:00
commit 334a9be4d0
3 changed files with 109 additions and 4 deletions

View file

@ -214,8 +214,8 @@ _Don't forget to create your database._
## Search ## Search
Realms wiki comes with basic search capabilities but it is not recommended Realms wiki comes with basic search capabilities but it is not recommended
for large wikis or if you require more advanced search capabilities. The only for large wikis or if you require more advanced search capabilities. The
backend we currently support is ElasticSearch backends we currently support are ElasticSearch and Whoosh.
### Elasticsearch Setup ### Elasticsearch Setup
@ -239,6 +239,25 @@ In your Realms Config, have the following options set:
"SEARCH_TYPE": "elasticsearch" "SEARCH_TYPE": "elasticsearch"
"ELASTICSEARCH_URL": "http://127.0.0.1:9200" "ELASTICSEARCH_URL": "http://127.0.0.1:9200"
### Whoosh Setup
Simply install Whoosh into your Python environment, e.g.:
pip install Whoosh
**Configuring Whoosh**
To use Whoosh, set the following in your Realms config:
"SEARCH_TYPE": "whoosh"
"WHOOSH_INDEX": "/path/to/your/whoosh/index"
"WHOOSH_LANGUAGE": "en"
WHOOSH_INDEX has to be a path that is readable and writable by the user running Realms. It will be created automatically if it doesn't exist.
Whoosh is set up to use language optimization, so set WHOOSH_LANGUAGE to the language used in your wiki. For available languages, check whoosh.lang.languages.
If your language is not supported, Realms will fall back to a simple text analyzer.
## Running ## Running
realms-wiki start realms-wiki start

View file

@ -100,6 +100,10 @@ SEARCH_TYPE = 'simple' # simple is not good for large wikis
# SEARCH_TYPE = 'elasticsearch' # SEARCH_TYPE = 'elasticsearch'
ELASTICSEARCH_URL = 'http://127.0.0.1:9200' ELASTICSEARCH_URL = 'http://127.0.0.1:9200'
# SEARCH_TYPE = 'whoosh'
WHOOSH_INDEX = '/tmp/whoosh'
WHOOSH_LANGUAGE = 'en'
# Get ReCaptcha Keys for your domain here: # Get ReCaptcha Keys for your domain here:
# https://www.google.com/recaptcha/admin#whyrecaptcha # https://www.google.com/recaptcha/admin#whyrecaptcha
RECAPTCHA_ENABLE = False RECAPTCHA_ENABLE = False

View file

@ -1,3 +1,5 @@
import sys
from flask import g, current_app from flask import g, current_app
from realms.lib.util import filename_to_cname from realms.lib.util import filename_to_cname
@ -6,6 +8,10 @@ def simple(app):
return SimpleSearch() return SimpleSearch()
def whoosh(app):
    """Build the Whoosh search backend from the application's config.

    Reads ``WHOOSH_INDEX`` and ``WHOOSH_LANGUAGE`` from ``app.config`` and
    passes them to ``WhooshSearch``.
    """
    config = app.config
    return WhooshSearch(config['WHOOSH_INDEX'], config['WHOOSH_LANGUAGE'])
def elasticsearch(app):
    """Build the Elasticsearch search backend for ``app``."""
    # Imported at call time rather than at module import.
    from flask.ext.elastic import Elastic
    client = Elastic(app)
    return ElasticSearch(client)
@ -42,6 +48,82 @@ class SimpleSearch(BaseSearch):
pass pass
class WhooshSearch(BaseSearch):
    """Search backend that keeps wiki pages in an on-disk Whoosh index.

    All Whoosh imports are done inside the methods, so Whoosh is only
    imported once this backend is actually instantiated.
    """

    def __init__(self, index_path, language):
        """Open the index stored in ``index_path``, creating it if needed.

        :param index_path: directory holding the Whoosh index. It is created
            (including missing parent directories) when it does not exist.
        :param language: language code handed to ``LanguageAnalyzer``. When
            Whoosh has no stemmer or no stop-word list for it, a plain
            ``SimpleAnalyzer`` is used instead.

        Terminates the process through ``sys.exit`` when the directory cannot
        be created or an existing index cannot be opened.
        """
        from whoosh import index as whoosh_index
        from whoosh.fields import Schema, TEXT, ID
        from whoosh import qparser
        from whoosh.highlight import UppercaseFormatter
        from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
        from whoosh.lang import has_stemmer, has_stopwords
        import os

        if not has_stemmer(language) or not has_stopwords(language):
            # TODO Display a warning?
            analyzer = SimpleAnalyzer()
        else:
            analyzer = LanguageAnalyzer(language)

        # "path" is the unique key used by update_document(); "body" is
        # indexed for searching but not stored.
        self.schema = Schema(path=ID(unique=True, stored=True),
                             body=TEXT(analyzer=analyzer))
        self.formatter = UppercaseFormatter()

        self.index_path = index_path

        if not os.path.exists(index_path):
            try:
                # makedirs (not mkdir) so that a nested, not yet existing
                # path such as /var/lib/realms/whoosh can be created too.
                os.makedirs(index_path)
            except OSError as e:
                sys.exit("Error creating Whoosh index: %s" % e)

        if whoosh_index.exists_in(index_path):
            try:
                self.search_index = whoosh_index.open_dir(index_path)
            except whoosh_index.IndexError as e:
                sys.exit("Error opening whoosh index: %s" % (e))
        else:
            self.search_index = whoosh_index.create_in(index_path, self.schema)

        self.query_parser = qparser.MultifieldParser(["body", "path"],
                                                     schema=self.schema)
        self.query_parser.add_plugin(qparser.FuzzyTermPlugin())

    def index(self, index, doc_type, id_=None, body=None):
        """Add or replace one document in the index.

        :param index: unused by this backend.
        :param doc_type: unused by this backend.
        :param id_: UTF-8 encoded byte string used as the document's unique
            ``path``.
        :param body: mapping whose ``"content"`` entry is the UTF-8 encoded
            page text.
        """
        # Decode before a writer is opened so that a bad argument cannot
        # leave an open writer behind.
        path = id_.decode("utf-8")
        content = body["content"].decode("utf-8")

        # As a context manager the writer commits on success and cancels
        # when the block raises.
        with self.search_index.writer() as writer:
            writer.update_document(path=path, body=content)

    def index_wiki(self, name, body):
        """Index the wiki page ``name`` with the page data ``body``."""
        self.index('wiki', 'page', id_=name, body=body)

    def delete_index(self, index):
        """Drop all indexed documents by recreating an empty index.

        :param index: unused by this backend.
        """
        from whoosh import index as whoosh_index

        self.search_index.close()
        self.search_index = whoosh_index.create_in(self.index_path,
                                                   schema=self.schema)

    def wiki(self, query):
        """Search the indexed pages.

        :param query: query string parsed against the ``body`` and ``path``
            fields.
        :returns: list of dicts with the keys ``name`` (page path) and
            ``content`` (highlighted excerpt); empty list for an empty query.
        """
        if not query:
            return []

        q = self.query_parser.parse(query)

        res = []
        with self.search_index.searcher() as s:
            results = s.search(q)
            results.formatter = self.formatter

            for hit in results:
                name = hit["path"]
                # The body is not stored in the index, so the current page
                # text is fetched from the wiki to build the excerpt.
                page_data = g.current_wiki.get_page(name)["data"].decode("utf-8")
                content = hit.highlights('body', text=page_data)
                res.append(dict(name=name, content=content))

        return res

    def users(self, query):
        """User search is not implemented for this backend; returns None."""
        pass
class ElasticSearch(BaseSearch): class ElasticSearch(BaseSearch):
def __init__(self, elastic): def __init__(self, elastic):
self.elastic = elastic self.elastic = elastic