| 1 | #!/usr/bin/env python3
|
| 2 |
|
| 3 | # This tool reads in the headings on a doc/ref page and produces a list of all
|
| 4 | # the symbols (and their anchors) which can be used as a search index.
|
| 5 | #
|
| 6 | # Usage:
|
| 7 | #
|
| 8 | # doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
|
| 9 |
|
| 10 | from html.parser import HTMLParser
|
| 11 | import argparse
|
| 12 | import json
|
| 13 | import os
|
| 14 |
|
| 15 |
|
class FindHeadings(HTMLParser):
    """Collect the document title and heading/anchor pairs from an HTML page.

    The doc/ref pages emit an empty <a name="..."></a> anchor immediately
    before each heading; we remember the most recently seen anchor and pair
    it with the heading text that follows it.
    """

    def __init__(self):
        super().__init__()

        # Stack of currently-open heading tags (normally depth 0 or 1).
        self.stack = []
        # Completed headings: dicts with 'tag', 'id' (anchor), and 'title'.
        self.headings = []
        # Most recent <a name="..."> value, paired with the next heading.
        self.anchor = None
        # Heading currently being accumulated, or None.
        self.heading = None

        # Text of the <title> element; '' while inside it, None if absent.
        self.title = None

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.title = ''

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            self.stack.append({ 'tag': tag, 'id': None })
            self.heading = dict(self.stack[-1])

        # Preceding each header is a <a name="anchor-name"></a>
        # Collect these anchors as link targets
        # Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
        # Was `tag in 'a'` (a substring test); `tag == 'a'` states the intent.
        if tag == 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
            self.anchor = attrs[0][1]

    def handle_endtag(self, tag):
        if len(self.stack) > 0 and self.stack[-1]['tag'] == tag:
            self.stack.pop()

            # Some headers are empty: only record headings that got text.
            # Guard self.heading against None so malformed/nested headings
            # don't raise TypeError on the `in` test.
            if self.heading and 'title' in self.heading:
                self.headings.append(self.heading)
            self.heading = None

    def handle_data(self, data):
        if self.title == '':
            self.title = data

        # Ignore data outside of headers
        if len(self.stack) == 0:
            return

        # We have to drop headers without anchors
        if not self.anchor:
            return

        data = data.strip()
        if not data:
            # Some headers are empty
            return

        # A heading may contain several data chunks (e.g. nested inline
        # elements); join them with single spaces.
        if 'title' in self.heading:
            self.heading['title'] = self.heading['title'] + ' ' + data
        else:
            self.heading['title'] = data
        self.heading['id'] = '#' + self.anchor

    def get_symbols(self, relpath):
        """Return the page's symbols as a nested list suitable for JSON.

        relpath: the page's path relative to the site root, used to build
        anchors like 'chap-foo.html#anchor-name'. Returns [] if the page
        had no <title>.
        """
        symbols = []

        if not self.title:
            return []

        for heading in self.headings:
            symbol = heading['title']
            if heading['tag'] == 'h2':
                symbols.append({ 'symbol': symbol, 'children': [], 'anchor': relpath + heading['id'] })
            elif heading['tag'] == 'h3' and symbols:
                # The `and symbols` guard skips an h3 appearing before any
                # h2 (previously an IndexError on symbols[-1]).
                symbols[-1]['children'].append({ 'symbol': symbol, 'anchor': relpath + heading['id'] })

        # Trim empty children lists to save space (saves ~4kB at time of writing)
        for item in symbols:
            if len(item['children']) == 0:
                del item['children']

        return [{ 'symbol': self.title, 'children': symbols, 'anchor': relpath }]
|
| 93 |
|
| 94 |
|
def main():
    """Parse CLI args, extract headings from one HTML file, print JSON.

    Reads the HTML file named on the command line, feeds it through
    FindHeadings, and prints the resulting symbol index as a single JSON
    array on stdout. --base-dir controls how the anchor paths are
    relativized.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--base-dir', type=str, help='Base directory to reference links from')
    arg_parser.add_argument('html', help='HTML file to extract headings from')
    opts = arg_parser.parse_args()

    with open(opts.html) as f:
        page = f.read()

    finder = FindHeadings()
    finder.feed(page)

    # Anchors in the index are relative to --base-dir (or the CWD when unset).
    rel = os.path.relpath(opts.html, start=opts.base_dir)
    print(json.dumps(finder.get_symbols(rel)))
|
| 112 |
|
| 113 |
|
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()
|