doctools/search

OILS / doctools / search_index.py View on Github | oils.pub

117 lines, 70 significant

1	#!/usr/bin/env python3
2
3	# This tool reads in the headings on a doc/ref page and produces a list of all
4	# the symbols (and their anchors) which can be used as a search index.
5	#
6	# Currently a WIP.
7	#
8	# Usage:
9	#
10	# doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
11
12	from html.parser import HTMLParser
13	import argparse
14	import json
15	import os
16
17
18	class FindHeadings(HTMLParser):
19	def __init__(self):
20	super().__init__()
21
22	self.stack = []
23	self.headings = []
24	self.anchor = None
25	self.heading = None
26
27	self.title = None
28
29	def handle_starttag(self, tag, attrs):
30	if tag == 'title':
31	self.title = ''
32
33	if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
34	self.stack.append({ 'tag': tag, 'id': None })
35	self.heading = dict(self.stack[-1])
36
37	# Preceding each header is a <a name="anchor-name"></a>
38	# Collect these anchors as link targets
39	if tag in 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
40	# Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
41	self.anchor = attrs[0][1]
42
43	def handle_endtag(self, tag):
44	if len(self.stack) > 0 and self.stack[-1]['tag'] == tag:
45	self.stack.pop()
46
47	# Some headers are empty
48	if 'title' in self.heading:
49	self.headings.append(self.heading)
50	self.heading = None
51
52	def handle_data(self, data):
53	if self.title == '':
54	self.title = data
55
56	# Ignore data outside of headers
57	if len(self.stack) == 0:
58	return
59
60	# We have to drop headers without anchors
61	if not self.anchor:
62	return
63
64	data = data.strip()
65	if not data:
66	# Some headers are empty
67	return
68
69	if 'title' in self.heading:
70	self.heading['title'] = self.heading['title'] + ' ' + data
71	else:
72	self.heading['title'] = data
73	self.heading['id'] = '#' + self.anchor
74
75	def get_symbols(self, relpath: str):
76	symbol = None
77	symbols = []
78
79	if not self.title:
80	return []
81
82	for heading in self.headings:
83	symbol = heading['title']
84	if heading['tag'] == 'h2':
85	symbols.append({ 'symbol': symbol, 'children': [], 'anchor': relpath + heading['id'] })
86	elif heading['tag'] == 'h3':
87	symbols[-1]['children'].append({ 'symbol': symbol, 'anchor': relpath + heading['id'] })
88
89	# Trim empty children lists to save space (saves ~4kB at time of writing)
90	for item in symbols:
91	if len(item['children']) == 0:
92	del item['children']
93
94	return [{ 'symbol': self.title, 'children': symbols, 'anchor': relpath }]
95
96
97	def main():
98	parser = argparse.ArgumentParser()
99	parser.add_argument('--base-dir', type=str, help='Base directory to reference links from')
100	parser.add_argument('html', help='HTML file to extract headings from')
101
102	args = parser.parse_args()
103
104	with open(args.html) as f:
105	source = f.read()
106
107	find_headings = FindHeadings()
108	find_headings.feed(source)
109
110	relpath = os.path.relpath(args.html, start=args.base_dir)
111	symbols = find_headings.get_symbols(relpath)
112
113	print(json.dumps(symbols))
114
115
116	if __name__ == '__main__':
117	main()