doctools/search

OILS / doctools / search_index.py View on Github | oils.pub

137 lines, 82 significant

1	#!/usr/bin/env python3
2	'''
3	This tool reads in the headings on a rendered HTML doc page and produces a tree
4	of all headings (and their anchors) which can be used as a search index.
5
6	The output is a JSON object with the shape Node[] where the Node type is
7	defined as:
8
9	type Node = {
10	symbol: string;
11	children: Node[] | undefined;
12	anchor: string;
13	};
14
15	Usage:
16
17	doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
18	'''
19
20	from html.parser import HTMLParser
21	import argparse
22	import json
23	import os
24
25
class FindHeadings(HTMLParser):
    '''
    Collect the <title> text and the anchored headings (h1-h6) of a page.

    Feed the HTML with feed(), then call GetSymbols() to get the heading
    tree.  Heading text is only collected once an <a name="..."></a> anchor
    has been seen; the most recently seen anchor becomes the heading's link
    target.  The anchor is not cleared between headings, so a heading
    without its own anchor reuses the previous one.
    '''

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self):
        super().__init__()

        # Heading tags that are currently open, innermost last.
        self.stack = []
        # Finished headings in document order; each is a dict with the keys
        # 'tag', 'id' and 'title'.
        self.headings = []
        # Value of the most recent <a name="...">, or None if none was seen.
        self.anchor = None
        # Heading whose text is currently being collected, or None.
        self.heading = None

        # Text of the <title> element; stays None if the page has no <title>.
        self.title = None
        self.in_title = False

    def handle_starttag(self, tag, attrs):
        '''Track <title>, heading and <a name=...> start tags.'''
        if tag == 'title':
            self.title = ''
            self.in_title = True

        if tag in self._HEADING_TAGS:
            self.stack.append({'tag': tag, 'id': None})
            self.heading = dict(self.stack[-1])

        # Preceding each header is a <a name='anchor-name'></a>
        # Collect these anchors as link targets
        if tag == 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
            # Only anchors whose sole attribute is name are picked up, which
            # is true for our generated HTML.
            self.anchor = attrs[0][1]

    def handle_endtag(self, tag):
        '''Close the <title> or the innermost open heading.'''
        if tag == 'title':
            self.in_title = False

        if not self.stack or self.stack[-1]['tag'] != tag:
            # Either we are outside any heading, or this closes inline
            # markup inside one (e.g. </code>).  The heading must stay open
            # so that text following the inline element is still collected.
            return

        self.stack.pop()

        # The heading's own end tag: record it if any text was collected.
        if self.heading and 'title' in self.heading:
            self.headings.append(self.heading)
            self.heading = None

    def handle_data(self, data):
        '''Append text to the <title> or to the heading being collected.'''
        if self.in_title:
            self.title += data
            return

        # Text only matters inside a heading, and only once an anchor exists
        # to link to.
        if not self.stack or not self.anchor:
            return

        payload = data.strip()
        if not payload:
            return

        heading = self.heading
        if heading is None:
            return

        # Text split up by inline markup is joined with single spaces.
        if 'title' in heading:
            heading['title'] = heading['title'] + ' ' + payload
        else:
            heading['title'] = payload
        heading['id'] = '#' + self.anchor

    def GetSymbols(self, relpath):
        '''
        Build the heading tree for the page that was fed to this parser.

        relpath is prepended to every anchor, e.g. 'ref/chap.html' gives
        'ref/chap.html#anchor-name'.

        Returns a one-element list holding the root node (the page title,
        anchored at relpath itself) whose children are the top-level
        headings, or an empty list if the page had no <title>.  Nodes are
        dicts with 'symbol', 'anchor' and, when non-empty, 'children'.
        '''
        if self.title is None:
            return []

        root_title = self.title.strip()

        symbols = []
        stack = []  # (level, node) for the chain of currently open ancestors

        for heading in self.headings:
            level = int(heading['tag'][1])  # 'h3' -> 3
            node = {
                'symbol': heading['title'],
                'anchor': relpath + heading['id'],
            }

            # Drop headings at the same or a deeper level; they cannot be
            # ancestors of this one.
            while stack and stack[-1][0] >= level:
                stack.pop()

            if stack:
                parent = stack[-1][1]
                parent.setdefault('children', []).append(node)
            else:
                symbols.append(node)

            stack.append((level, node))

        return [{'symbol': root_title, 'children': symbols, 'anchor': relpath}]
113
114
def main():
    '''
    Command-line entry point: print the heading tree of an HTML file as JSON.

    The anchors in the output are the HTML file's path relative to
    --base-dir, followed by the '#anchor' of each heading.  When --base-dir
    is omitted, os.path.relpath receives start=None and resolves the path
    against the current directory.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--base-dir', type=str, help='Base directory to reference links from'
    )
    parser.add_argument('html', help='HTML file to extract headings from')

    args = parser.parse_args()

    # Decode explicitly as UTF-8 rather than with the locale's default
    # encoding, so the result does not depend on the environment the docs
    # are built in.  NOTE(review): assumes the generated pages are UTF-8.
    with open(args.html, encoding='utf-8') as f:
        source = f.read()

    find_headings = FindHeadings()
    find_headings.feed(source)

    relpath = os.path.relpath(args.html, start=args.base_dir)
    symbols = find_headings.GetSymbols(relpath)

    print(json.dumps(symbols))


if __name__ == '__main__':
    main()