OILS / doctools / search_index.py View on Github | oils.pub

115 lines, 70 significant
1#!/usr/bin/env python3
2
3# This tool reads in the headings on a doc/ref page and produces a list of all
4# the symbols (and their anchors) which can be used as a search index.
5#
6# Usage:
7#
8# doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
9
10from html.parser import HTMLParser
11import argparse
12import json
13import os
14
15
16class FindHeadings(HTMLParser):
17 def __init__(self):
18 super().__init__()
19
20 self.stack = []
21 self.headings = []
22 self.anchor = None
23 self.heading = None
24
25 self.title = None
26
27 def handle_starttag(self, tag, attrs):
28 if tag == 'title':
29 self.title = ''
30
31 if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
32 self.stack.append({ 'tag': tag, 'id': None })
33 self.heading = dict(self.stack[-1])
34
35 # Preceding each header is a <a name="anchor-name"></a>
36 # Collect these anchors as link targets
37 if tag in 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
38 # Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
39 self.anchor = attrs[0][1]
40
41 def handle_endtag(self, tag):
42 if len(self.stack) > 0 and self.stack[-1]['tag'] == tag:
43 self.stack.pop()
44
45 # Some headers are empty
46 if 'title' in self.heading:
47 self.headings.append(self.heading)
48 self.heading = None
49
50 def handle_data(self, data):
51 if self.title == '':
52 self.title = data
53
54 # Ignore data outside of headers
55 if len(self.stack) == 0:
56 return
57
58 # We have to drop headers without anchors
59 if not self.anchor:
60 return
61
62 data = data.strip()
63 if not data:
64 # Some headers are empty
65 return
66
67 if 'title' in self.heading:
68 self.heading['title'] = self.heading['title'] + ' ' + data
69 else:
70 self.heading['title'] = data
71 self.heading['id'] = '#' + self.anchor
72
73 def get_symbols(self, relpath):
74 symbol = None
75 symbols = []
76
77 if not self.title:
78 return []
79
80 for heading in self.headings:
81 symbol = heading['title']
82 if heading['tag'] == 'h2':
83 symbols.append({ 'symbol': symbol, 'children': [], 'anchor': relpath + heading['id'] })
84 elif heading['tag'] == 'h3':
85 symbols[-1]['children'].append({ 'symbol': symbol, 'anchor': relpath + heading['id'] })
86
87 # Trim empty children lists to save space (saves ~4kB at time of writing)
88 for item in symbols:
89 if len(item['children']) == 0:
90 del item['children']
91
92 return [{ 'symbol': self.title, 'children': symbols, 'anchor': relpath }]
93
94
95def main():
96 parser = argparse.ArgumentParser()
97 parser.add_argument('--base-dir', type=str, help='Base directory to reference links from')
98 parser.add_argument('html', help='HTML file to extract headings from')
99
100 args = parser.parse_args()
101
102 with open(args.html) as f:
103 source = f.read()
104
105 find_headings = FindHeadings()
106 find_headings.feed(source)
107
108 relpath = os.path.relpath(args.html, start=args.base_dir)
109 symbols = find_headings.get_symbols(relpath)
110
111 print(json.dumps(symbols))
112
113
114if __name__ == '__main__':
115 main()