doctools/search

OILS / doctools / search_index.py View on Github | oils.pub

102 lines, 62 significant

1	#!/usr/bin/env python3
2
3	# This tool reads in the headings on a doc/ref page and produces a list of all
4	# the symbols (and their anchors) which can be used as a search index.
5	#
6	# Currently a WIP.
7	#
8	# Usage:
9	#
10	# doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
11
12	from html.parser import HTMLParser
13	import argparse
14	import json
15	import os
16
17
18	class FindHeadings(HTMLParser):
19	def __init__(self):
20	super().__init__()
21
22	self.stack = []
23	self.headings = []
24	self.anchor = None
25	self.heading = None
26
27	def handle_starttag(self, tag, attrs):
28	if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
29	self.stack.append({ 'tag': tag, 'id': None })
30	self.heading = dict(self.stack[-1])
31
32	# Preceding each header is a <a name="anchor-name"></a>
33	# Collect these anchors as link targets
34	if tag in 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
35	# Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
36	self.anchor = attrs[0][1]
37
38	def handle_endtag(self, tag):
39	if len(self.stack) > 0 and self.stack[-1]['tag'] == tag:
40	self.stack.pop()
41
42	# Some headers are empty
43	if 'title' in self.heading:
44	self.headings.append(self.heading)
45	self.heading = None
46
47	def handle_data(self, data):
48	# Ignore data outside of headers
49	if len(self.stack) == 0:
50	return
51
52	# We have to drop headers without anchors
53	if not self.anchor:
54	return
55
56	data = data.strip()
57	if not data:
58	# Some headers are empty
59	return
60
61	if 'title' in self.heading:
62	self.heading['title'] = self.heading['title'] + ' ' + data
63	else:
64	self.heading['title'] = data
65	self.heading['id'] = '#' + self.anchor
66
67	def get_symbols(self):
68	symbol = None
69	symbols = []
70	for heading in self.headings:
71	if heading['tag'] == 'h2':
72	symbol = heading['title']
73	symbols.append({ 'symbol': symbol, 'anchor': heading['id'] })
74	elif heading['tag'] == 'h3':
75	symbols.append({ 'symbol': symbol + ' > ' + heading['title'], 'anchor': heading['id'] })
76
77	return symbols
78
79
def main():
    """Print a JSON search index for the headings of one HTML page.

    Command-line arguments:
        html: HTML file to extract headings from.
        --base-dir: Directory the emitted links are made relative to.  When
            omitted, os.path.relpath falls back to the current directory.

    The output is a JSON list of {"symbol": ..., "anchor": ...} objects
    written to stdout, where each anchor is the page's relative path
    followed by '#anchor-name'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str, help='Base directory to reference links from')
    parser.add_argument('html', help='HTML file to extract headings from')

    args = parser.parse_args()

    # Decode explicitly as UTF-8 rather than relying on the locale's default
    # encoding, which varies between machines.
    with open(args.html, encoding='utf-8') as f:
        source = f.read()

    find_headings = FindHeadings()
    find_headings.feed(source)

    symbols = find_headings.get_symbols()

    # The page's relative path is the same for every symbol; compute it once.
    relpath = os.path.relpath(args.html, start=args.base_dir)
    for sym in symbols:
        sym['anchor'] = relpath + sym['anchor']

    print(json.dumps(symbols))
99
100
# Run the indexer only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()