OILS / doctools / search_index.py View on Github | oils.pub

117 lines, 70 significant
1#!/usr/bin/env python3
2
3# This tool reads in the headings on a doc/ref page and produces a list of all
4# the symbols (and their anchors) which can be used as a search index.
5#
6# Currently a WIP.
7#
8# Usage:
9#
10# doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
11
12from html.parser import HTMLParser
13import argparse
14import json
15import os
16
17
18class FindHeadings(HTMLParser):
19 def __init__(self):
20 super().__init__()
21
22 self.stack = []
23 self.headings = []
24 self.anchor = None
25 self.heading = None
26
27 self.title = None
28
29 def handle_starttag(self, tag, attrs):
30 if tag == 'title':
31 self.title = ''
32
33 if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
34 self.stack.append({ 'tag': tag, 'id': None })
35 self.heading = dict(self.stack[-1])
36
37 # Preceding each header is a <a name="anchor-name"></a>
38 # Collect these anchors as link targets
39 if tag in 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
40 # Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
41 self.anchor = attrs[0][1]
42
43 def handle_endtag(self, tag):
44 if len(self.stack) > 0 and self.stack[-1]['tag'] == tag:
45 self.stack.pop()
46
47 # Some headers are empty
48 if 'title' in self.heading:
49 self.headings.append(self.heading)
50 self.heading = None
51
52 def handle_data(self, data):
53 if self.title == '':
54 self.title = data
55
56 # Ignore data outside of headers
57 if len(self.stack) == 0:
58 return
59
60 # We have to drop headers without anchors
61 if not self.anchor:
62 return
63
64 data = data.strip()
65 if not data:
66 # Some headers are empty
67 return
68
69 if 'title' in self.heading:
70 self.heading['title'] = self.heading['title'] + ' ' + data
71 else:
72 self.heading['title'] = data
73 self.heading['id'] = '#' + self.anchor
74
75 def get_symbols(self, relpath: str):
76 symbol = None
77 symbols = []
78
79 if not self.title:
80 return []
81
82 for heading in self.headings:
83 symbol = heading['title']
84 if heading['tag'] == 'h2':
85 symbols.append({ 'symbol': symbol, 'children': [], 'anchor': relpath + heading['id'] })
86 elif heading['tag'] == 'h3':
87 symbols[-1]['children'].append({ 'symbol': symbol, 'anchor': relpath + heading['id'] })
88
89 # Trim empty children lists to save space (saves ~4kB at time of writing)
90 for item in symbols:
91 if len(item['children']) == 0:
92 del item['children']
93
94 return [{ 'symbol': self.title, 'children': symbols, 'anchor': relpath }]
95
96
97def main():
98 parser = argparse.ArgumentParser()
99 parser.add_argument('--base-dir', type=str, help='Base directory to reference links from')
100 parser.add_argument('html', help='HTML file to extract headings from')
101
102 args = parser.parse_args()
103
104 with open(args.html) as f:
105 source = f.read()
106
107 find_headings = FindHeadings()
108 find_headings.feed(source)
109
110 relpath = os.path.relpath(args.html, start=args.base_dir)
111 symbols = find_headings.get_symbols(relpath)
112
113 print(json.dumps(symbols))
114
115
116if __name__ == '__main__':
117 main()