#!/usr/bin/python3

"""
A custom CGI for searching pelican blogs with suitable templates
using xapian.

To create the index:

env BLOG_DIR=path_to_blog blogsearch

Note that the database will be created within the document dir and is
thus world-readable with common setups.  I don't consider that a problem.

Make sure BLOG_DIR is also set in whatever executes the CGI (e.g., using
SetEnv in apache).

To have persistent indexes only updated as necessary, have
DELETE_OUTPUT_DIRECTORY = False in pelican's publishconf.py.  You'll
also need the articlemtime.py plugin for incremental index updates.

See https://blog-g.vo.org/a-local-search-engine-for-pelican-based-blogs-en.html
for more deployment hints.

Dependencies: python3-bs4, python3-xapian, python3-lxml
"""

import cgi
import fnmatch
import functools
import json
import os
import re
import sys

import bs4
import xapian


IS_CGI = "SERVER_SOFTWARE" in os.environ
DB_NAME = ".xapian_db"


################ Micro templating start
def escapePCDATA(val):
	"""returns val as a string with escapes for HTML character data.

	None becomes the empty string.  Anything else is stringified, and
	the characters &, <, >, and NUL are replaced by escape sequences.
	"""
	if val is None:
		return ""

	# every source character is mapped exactly once; that gives the
	# same result as replacing & first and the other characters later.
	escapes = {
		ord("&"): "&amp;",
		ord("<"): "&lt;",
		ord(">"): "&gt;",
		ord("\0"): "&x00;",}
	return str(val).translate(escapes)


def escapeAttrVal(val):
	"""returns val with escapes for double-quoted attribute values.
	"""
	if val is None:
		return '""'
	return escapePCDATA(val).replace('"', '&quot;')


class Template(object):
	"""a *very* basic and ad-hoc template engine.

	It works on HTML strings, with the following constructs expanded:

	* $[key] -- value for key, escaped for double-quoted att values
	* $(key) -- value for key, escaped for PCDATA
	* $|func| -- replace with the value of func(vars)
	* $!raw! -- value for key, non-escaped (other template ops are expanded)
	* $$ -- a $ char.

	All constructs are expanded in a single pass over the template.
	Text produced by $[key], $(key), or $|func| is therefore never
	scanned for template constructs again; in particular, user input
	inserted through the escaping constructs cannot trigger function
	calls or further substitutions.  Only $!raw! values have the other
	constructs (but not nested $!raw! ones) expanded.

	Use either unicode strings or plain ASCII
	"""
	# one alternative per construct; exactly one named group takes part
	# in any match, so match.lastgroup tells us which construct we have.
	_TOKEN_PAT = re.compile(
		r"\$(?:"
		r"(?P<dollar>\$)"
		r"|\|(?P<func>[a-zA-Z0-9_]+)\|"
		r"|\((?P<pcdata>[a-zA-Z0-9_]+)\)"
		r"|\[(?P<attr>[a-zA-Z0-9_]+)\]"
		r"|!(?P<raw>[a-zA-ZÄÖÜäöüß0-9_]+)!"
		r")")

	def __init__(self, source):
		self.source = str(source)

	def render(self, vars):
		"""returns a string with the template filled using vars.

		vars is a dictionary mapping keys to unicode-able objects.
		Keys missing from vars expand to empty strings.  Functions
		referenced with $|func| are looked up in this module's globals
		and are called with vars; they must return strings.

		You'll get back a unicode string that you must encode before
		spitting it out to the web.
		"""
		def expand(mat, in_raw=False):
			kind = mat.lastgroup
			key = mat.group(kind)

			if kind=="dollar":
				return "$"
			if kind=="func":
				return globals()[key](vars)
			if kind=="pcdata":
				return escapePCDATA(vars.get(key, ""))
			if kind=="attr":
				return escapeAttrVal(vars.get(key, ""))

			# kind=="raw": the value is inserted unescaped, and the
			# other constructs in it are expanded.  Raw references
			# within raw values are left alone to avoid endless
			# recursion on self-referencing values.
			if in_raw:
				return mat.group(0)
			return self._TOKEN_PAT.sub(
				lambda inner: expand(inner, True),
				str(vars.get(key, "")))

		return self._TOKEN_PAT.sub(expand, self.source)

	def serve(self, vars):
		"""emits a basic CGI response for this template.

		The rendered template is written to stdout as UTF-8 with CRLF
		line endings, preceded by content-type and content-length
		headers.
		"""
		payload = self.render(vars
			).replace("\n", "\r\n").encode("utf-8")

		sys.stdout.buffer.write((
			"content-type: text/html;charset=utf-8\r\n"
			"content-length: %d\r\n\r\n"%len(payload)).encode("ascii"))
		sys.stdout.buffer.write(payload)

################ My templates, pre-compiled

# The result page: a search form pre-filled with the current query,
# followed by the list of matches as produced by list_matches.
RESP_TEMPLATE = Template("""<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Search...</title>
<link rel="stylesheet" href="/theme/css/style.css"/>
</head>
<body>
<form action="" method="GET"
	style="display:flex;width:100%;flex-flow: row nowrap"
	class="searchform">
<label for="query" style="flex-grow:0">Query: </label>
<input id="query" type="text" name="q" style="flex-grow:1"
	value="$[query]"/>
<input type="submit" value="Search" style="flex-grow: 0"/>
</form>

<ul>
$|list_matches|
</ul>
</body>
</html>""")


# A single search hit as a list item.  list_matches renders this with
# the keys dest_url (the link target is "/" plus dest_url) and title
# (the link text).
MATCH_TEMPLATE = Template("""<li xmlns="http://www.w3.org/1999/xhtml">
	<a href="/$[dest_url]">$(title)</a></li>""")


# The page show_error serves when running as a CGI; the only key it
# expands is msg, which is inserted PCDATA-escaped.
ERROR_TEMPLATE = Template("""<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Failed</title>
<link rel="stylesheet" href="/theme/css/style.css"/>
</head>
<body>
<h1>Sorry</h1>
<p>Whatever you tried, it didn't work out:</p>
<p class="errmsg">$(msg)</p>
</body>
</html>""")


def show_error(msg):
	"""reports msg to whoever is running us.

	On the command line, msg is written to stderr as is; when running
	as a CGI, it is served within ERROR_TEMPLATE.
	"""
	if not IS_CGI:
		sys.stderr.write(msg)
		return

	ERROR_TEMPLATE.serve({"msg": msg})


def list_matches(vars):
	"""template function rendering the search results as li elements.

	This expects the matches in vars["matches"]; each match's document
	data is parsed as a JSON object with path and title keys.  If
	there are no matches, a single li element with some hints is
	returned instead.
	"""
	def render_match(match):
		meta = json.loads(match.document.get_data().decode("utf-8"))
		return MATCH_TEMPLATE.render({
			"dest_url": meta["path"],
			# only show what's behind the first colon of the page title
			"title": meta["title"].split(":", 1)[-1],})

	rendered = [render_match(match) for match in vars["matches"]]

	if not rendered:
		return ("<li>Nothing, sorry.  "
			'<a href="https://blog.tfiu.de/stemming-for-the-search-engine.html">'
			'Use wildcards</a> or go to <a href="/">the main page</a>.</li>')

	return "\n".join(rendered)


def cgi_main(document_dir):
	"""runs a search from a CGI request and serves the result page.

	The query is taken from the q request parameter.  If it starts
	with l:, the rest of the first word (limited to 28 characters) is
	used as the language of the stemmer, and only the remaining words
	are passed to xapian.  At most 100 matches are retrieved.

	A request without a (non-empty) query just yields the search form
	with an empty result list.

	document_dir is the directory containing the xapian database.
	"""
	fields = cgi.FieldStorage()

	# default to an empty query so the bare form can be served; without
	# the default, a missing q would give None and break startswith.
	query = fields.getfirst("q", "")
	if query.startswith("l:"):
		parts = query.split()
		lang = parts[0][2:30]
		proc_query = " ".join(parts[1:])
	else:
		lang, proc_query = "", query

	if proc_query:
		db = xapian.Database(os.path.join(document_dir, DB_NAME))
		qp = xapian.QueryParser()
		if lang:
			qp.set_stemmer(xapian.Stem(lang))
		qp.set_database(db)

		enq = xapian.Enquire(db)
		enq.set_query(qp.parse_query(proc_query,
			qp.FLAG_DEFAULT | qp.FLAG_WILDCARD))
		matches = enq.get_mset(0, 100)

	else:
		matches = []

	# only hand the template what it is supposed to see rather than
	# all our locals.
	RESP_TEMPLATE.serve({
		"query": query,
		"lang": lang,
		"proc_query": proc_query,
		"matches": matches,})
		

########################### Indexing

def soup_to_text(soup, parts=None):
	"""returns the text content of soup, with the pieces joined by blanks.

	Navigable strings among soup's children are used as they are, other
	children are converted recursively.  For a soup of None, this
	returns an empty string.

	If a list is passed in as parts, the pieces are appended to it, and
	whatever it already contained becomes part of the result.
	"""
	if soup is None:
		return ""

	collected = [] if parts is None else parts
	collected.extend(
		child if isinstance(child, bs4.NavigableString)
			else soup_to_text(child)
		for child in soup.contents)

	return " ".join(collected)


def remove_prefix(s, prefix):
	"""returns s without a leading prefix.

	Strings not starting with prefix are returned unchanged.
	"""
	return s[len(prefix):] if s.startswith(prefix) else s


@functools.lru_cache(maxsize=20)
def get_indexer(lang):
	"""returns a (cached) xapian term generator for the language lang.

	lang is passed on to xapian.Stem, so whatever that rejects will
	make this function raise an exception.

	If lang is None or empty, the term generator returned has no
	stemmer set.
	"""
	term_generator = xapian.TermGenerator()
	if not lang:
		return term_generator

	term_generator.set_stemmer(xapian.Stem(lang))
	return term_generator


def index_html(f_name, document_dir):
	"""returns a xapian document for the html file f_name.

	The document's data is a JSON object giving the page title, the
	path with the document_dir prefix removed, and the file's mtime.

	Terms are only added for "content" HTML files (i.e., articles and
	pages), which are recognised by an element with the class
	indexable; that element's lang attribute selects the stemmer.  For
	anything else, the document comes back without terms; that's so we
	don't touch these files again until the database is removed.
	"""
	with open(f_name, encoding="utf-8") as src:
		tree = bs4.BeautifulSoup(src, "lxml")

	rel_path = remove_prefix(f_name, document_dir)
	doc = xapian.Document()
	doc.set_data(json.dumps({
		"title": soup_to_text(tree.find("title")),
		"path": rel_path,
		"mtime": os.path.getmtime(f_name),}))

	content = tree.find(class_="indexable")
	if content:
		# index files and similar don't get here and hence have no terms
		lang = content.attrs.get("lang")
		indexer = get_indexer(lang)

		print(f"Adding/updating {rel_path}, language {lang}")

		indexer.set_document(doc)
		indexer.index_text(soup_to_text(content))

	return doc


def get_indexed_paths(database):
	"""returns a dict describing the files already indexed in database.

	The keys are the (relative) paths stored with the documents, the
	values are pairs of the mtime recorded when the file was indexed
	and the docid of its document.
	"""
	def describe(docid):
		raw_meta = database.get_document(docid).get_data()
		meta = json.loads(raw_meta.decode("utf-8"))
		return meta["path"], (meta["mtime"], docid)

	# the empty term's posting list is what gives us all documents here
	return dict(
		describe(posting.docid) for posting in database.postlist(""))


def do_index(document_dir):
	"""updates the document index.

	This walks document_dir for *.html files and (re-)indexes those that
	are not in the database yet or that have changed since they were
	indexed (going by their mtimes).  Documents for files that are no
	longer present are removed from the database so searches don't
	return dead links.

	The database is created within document_dir if it doesn't exist yet.
	"""
	# make sure document_dir ends with a slash; only then will our
	# links above end up right.
	document_dir = document_dir.rstrip("/")+"/"
	database = xapian.WritableDatabase(
		os.path.join(document_dir, DB_NAME),
		xapian.DB_CREATE_OR_OPEN)

	try:
		files_present = get_indexed_paths(database)
		paths_seen = set()

		for cur_dir, _, names in os.walk(document_dir):
			for name in fnmatch.filter(names, "*.html"):
				path = os.path.join(cur_dir, name)
				rel_path = remove_prefix(path, document_dir)
				paths_seen.add(rel_path)

				# see if we've indexed it already and we don't need to
				# re-index it
				if rel_path in files_present:
					last_update, docid = files_present[rel_path]
					if os.path.getmtime(path)<=last_update:
						continue
					database.delete_document(docid)

				doc = index_html(path, document_dir)
				if doc:
					database.add_document(doc)

		# drop what's in the index but no longer on the disk
		for rel_path, (_, docid) in files_present.items():
			if rel_path not in paths_seen:
				print(f"Removing {rel_path}, file is gone")
				database.delete_document(docid)

		# commit explicitly: failures during the implicit commit when the
		# database object goes away would pass unnoticed.
		database.commit()
	finally:
		database.close()


if __name__=="__main__":
	# Entry point: search when called as a CGI, update the index when
	# run from the command line.  Any failure is reported through
	# show_error and results in an exit status of 1.
	try:
		if os.environ.get("BLOG_DIR") is None:
			raise Exception(
				"BLOG_DIR not defined: Set it to your rendered pelican dir.")

		if not IS_CGI:
			do_index(os.environ["BLOG_DIR"])
		else:
			cgi_main(os.environ["BLOG_DIR"])
	except Exception as exc:
		show_error(str(exc)+"\n")
		sys.exit(1)
