OILS / doctools / search_index.py View on Github | oils.pub

102 lines, 62 significant
1#!/usr/bin/env python3
2
3# This tool reads in the headings on a doc/ref page and produces a list of all
4# the symbols (and their anchors) which can be used as a search index.
5#
6# Currently a WIP.
7#
8# Usage:
9#
10# doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
11
12from html.parser import HTMLParser
13import argparse
14import json
15import os
16
17
class FindHeadings(HTMLParser):
    """Collect the headings of an HTML page together with their link anchors.

    The parser expects each heading to be preceded by an anchor of the form
    <a name="anchor-name"></a>.  Feed it HTML with feed(), then call
    get_symbols() to obtain the search-index entries.

    Attributes:
        stack: heading tags that are currently open (innermost last).
        headings: completed headings, each a dict with the keys 'tag',
            'id' ('#' + anchor name) and 'title'.
        anchor: name of the most recent anchor that has not yet been
            consumed by a heading, or None.
        heading: the heading currently being built, or None.
    """

    # Tags that are treated as headings.
    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self):
        super().__init__()

        self.stack = []
        self.headings = []
        self.anchor = None
        self.heading = None

    def handle_starttag(self, tag, attrs):
        """Track opening heading tags and remember <a name="..."> anchors."""
        if tag in self._HEADING_TAGS:
            self.stack.append({'tag': tag, 'id': None})
            self.heading = dict(self.stack[-1])

        # Preceding each header is a <a name="anchor-name"></a>
        # Collect these anchors as link targets
        #
        # Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
        # Compare with ==, not `in`: `tag in 'a'` would be a substring test.
        if tag == 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
            self.anchor = attrs[0][1]

    def handle_endtag(self, tag):
        """Finish the current heading when its closing tag is seen."""
        if not self.stack or self.stack[-1]['tag'] != tag:
            return
        self.stack.pop()

        # Some headers are empty (no title), and with nested headings the
        # inner one has already been finished, leaving self.heading as None.
        if self.heading is not None and 'title' in self.heading:
            self.headings.append(self.heading)
        self.heading = None

        # An anchor belongs to one heading only.  Forget it so that a later
        # heading without its own anchor is dropped instead of being linked
        # to this heading's anchor.
        self.anchor = None

    def handle_data(self, data):
        """Accumulate text found inside a heading into its title."""
        # Ignore data outside of headers
        if not self.stack or self.heading is None:
            return

        # We have to drop headers without anchors
        if not self.anchor:
            return

        data = data.strip()
        if not data:
            # Some headers are empty
            return

        # Text split across child tags (e.g. <code>) is joined with spaces.
        if 'title' in self.heading:
            self.heading['title'] = self.heading['title'] + ' ' + data
        else:
            self.heading['title'] = data
            self.heading['id'] = '#' + self.anchor

    def get_symbols(self):
        """Return the search-index entries for the collected headings.

        Each entry is a dict {'symbol': ..., 'anchor': ...}.  An h2 heading
        yields its own title; an h3 heading yields 'H2 TITLE > H3 TITLE'
        using the closest preceding h2.  An h3 that appears before any h2
        yields just its own title.  Other heading levels are skipped.
        """
        parent = None
        symbols = []
        for heading in self.headings:
            level = heading['tag']
            title = heading['title']

            if level == 'h2':
                parent = title
                name = title
            elif level == 'h3':
                # Without a parent h2 there is nothing to prefix.
                name = title if parent is None else parent + ' > ' + title
            else:
                continue

            symbols.append({'symbol': name, 'anchor': heading['id']})

        return symbols
78
79
def main():
    """Print a JSON search index for the headings of one HTML file.

    Command line:
        --base-dir DIR  directory that the emitted links are relative to
        html            HTML file to extract headings from

    The output is a JSON list of {'symbol': ..., 'anchor': ...} objects
    written to stdout, where each anchor is the path of the HTML file
    relative to the base directory followed by '#anchor-name'.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str, help='Base directory to reference links from')
    parser.add_argument('html', help='HTML file to extract headings from')

    args = parser.parse_args()

    # Read as UTF-8 explicitly so the result does not depend on the locale
    # of the environment the tool runs in.
    with open(args.html, encoding='utf-8') as f:
        source = f.read()

    find_headings = FindHeadings()
    find_headings.feed(source)
    # Flush anything the parser is still buffering.
    find_headings.close()

    # The relative path is the same for every symbol, so compute it once.
    # When --base-dir is omitted, args.base_dir is None and os.path.relpath
    # resolves the path against the current directory.
    relpath = os.path.relpath(args.html, start=args.base_dir)

    symbols = find_headings.get_symbols()
    for sym in symbols:
        sym['anchor'] = relpath + sym['anchor']

    print(json.dumps(symbols))
99
100
# Run the command-line tool only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()