OILS / doctools / search_index.py View on Github | oils.pub

137 lines, 82 significant
1#!/usr/bin/env python3
2'''
3This tool reads in the headings on a rendered HTML doc page and produces a tree
4of all headings (and their anchors) which can be used as a search index.
5
6The output is a JSON object with the shape Node[] where the Node type is
7defined as:
8
9type Node = {
10 symbol: string;
11 children: Node[] | undefined;
12 anchor: string;
13};
14
15Usage:
16
17 doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
18'''
19
20from html.parser import HTMLParser
21import argparse
22import json
23import os
24
25
26class FindHeadings(HTMLParser):
27 def __init__(self):
28 super().__init__()
29
30 self.stack = []
31 self.headings = []
32 self.anchor = None
33 self.heading = None
34
35 self.title = None
36 self.in_title = False
37
38 def handle_starttag(self, tag, attrs):
39 if tag == 'title':
40 self.title = ''
41 self.in_title = True
42
43 if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
44 self.stack.append({'tag': tag, 'id': None})
45 self.heading = dict(self.stack[-1])
46
47 # Preceding each header is a <a name='anchor-name'></a>
48 # Collect these anchors as link targets
49 if tag == 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
50 # This assumes that name is the first attribute on the <a></a>,
51 # which is true for our generated HTML.
52 self.anchor = attrs[0][1]
53
54 def handle_endtag(self, tag):
55 if tag == 'title':
56 self.in_title = False
57
58 if len(self.stack) > 0 and self.stack[-1]['tag'] == tag:
59 self.stack.pop()
60
61 if self.heading and 'title' in self.heading:
62 self.headings.append(self.heading)
63 self.heading = None
64
65 def handle_data(self, data):
66 if self.in_title:
67 self.title += data
68 return
69
70 if len(self.stack) == 0 or not self.anchor:
71 return
72
73 payload = data.strip()
74 if not payload:
75 return
76
77 heading = self.heading
78 if heading is None:
79 return
80
81 if 'title' in heading:
82 heading['title'] = heading['title'] + ' ' + payload
83 else:
84 heading['title'] = payload
85 heading['id'] = '#' + self.anchor
86
87 def GetSymbols(self, relpath):
88 if self.title is None:
89 return []
90
91 root_title = self.title.strip()
92
93 symbols = []
94 stack = [] # (level, node)
95
96 for heading in self.headings:
97 level = int(heading['tag'][1])
98 node = {'symbol': heading['title'], 'anchor': relpath + heading['id']}
99
100 while stack and stack[-1][0] >= level:
101 stack.pop()
102
103 if stack:
104 parent = stack[-1][1]
105 parent_children = parent.setdefault('children', [])
106 parent_children.append(node)
107 else:
108 symbols.append(node)
109
110 stack.append((level, node))
111
112 return [{'symbol': root_title, 'children': symbols, 'anchor': relpath}]
113
114
115def main():
116 parser = argparse.ArgumentParser()
117 parser.add_argument(
118 '--base-dir', type=str, help='Base directory to reference links from'
119 )
120 parser.add_argument('html', help='HTML file to extract headings from')
121
122 args = parser.parse_args()
123
124 with open(args.html) as f:
125 source = f.read()
126
127 find_headings = FindHeadings()
128 find_headings.feed(source)
129
130 relpath = os.path.relpath(args.html, start=args.base_dir)
131 symbols = find_headings.GetSymbols(relpath)
132
133 print(json.dumps(symbols))
134
135
136if __name__ == '__main__':
137 main()