| 1 | #!/usr/bin/env python3
|
| 2 |
|
| 3 | # This tool reads in the headings on a doc/ref page and produces a list of all
|
| 4 | # the symbols (and their anchors) which can be used as a search index.
|
| 5 | #
|
| 6 | # Usage:
|
| 7 | #
|
| 8 | # doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
|
| 9 |
|
| 10 | from html.parser import HTMLParser
|
| 11 | import argparse
|
| 12 | import json
|
| 13 | import os
|
| 14 |
|
| 15 |
|
class FindHeadings(HTMLParser):
    """Collect the document title and heading/anchor pairs from an HTML page.

    The doc/ref pages emit an empty <a name="..."></a> anchor immediately
    before each heading; we remember the most recently seen anchor and pair
    it with the heading text that follows it.
    """

    def __init__(self):
        super().__init__()

        # Stack of currently-open heading tags (normally depth 0 or 1).
        self.stack = []
        # Completed headings: dicts with 'tag', 'id' (anchor), and 'title'.
        self.headings = []
        # Most recent <a name="..."> value, paired with the next heading.
        self.anchor = None
        # Heading currently being accumulated, or None.
        self.heading = None

        # Text of the <title> element; '' while inside it, None if absent.
        self.title = None

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self.title = ''

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            self.stack.append({ 'tag': tag, 'id': None })
            self.heading = dict(self.stack[-1])

        # Preceding each header is a <a name="anchor-name"></a>
        # Collect these anchors as link targets
        # Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
        # Was `tag in 'a'` (a substring test); `tag == 'a'` states the intent.
        if tag == 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
            self.anchor = attrs[0][1]

    def handle_endtag(self, tag):
        if len(self.stack) > 0 and self.stack[-1]['tag'] == tag:
            self.stack.pop()

            # Some headers are empty: only record headings that got text.
            # Guard self.heading against None so malformed/nested headings
            # don't raise TypeError on the `in` test.
            if self.heading and 'title' in self.heading:
                self.headings.append(self.heading)
            self.heading = None

    def handle_data(self, data):
        if self.title == '':
            self.title = data

        # Ignore data outside of headers
        if len(self.stack) == 0:
            return

        # We have to drop headers without anchors
        if not self.anchor:
            return

        data = data.strip()
        if not data:
            # Some headers are empty
            return

        # A heading may contain several data chunks (e.g. nested inline
        # elements); join them with single spaces.
        if 'title' in self.heading:
            self.heading['title'] = self.heading['title'] + ' ' + data
        else:
            self.heading['title'] = data
        self.heading['id'] = '#' + self.anchor

    def get_symbols(self, relpath):
        """Return the page's symbols as a nested list suitable for JSON.

        relpath: the page's path relative to the site root, used to build
        anchors like 'chap-foo.html#anchor-name'. Returns [] if the page
        had no <title>.
        """
        symbols = []

        if not self.title:
            return []

        for heading in self.headings:
            symbol = heading['title']
            if heading['tag'] == 'h2':
                symbols.append({ 'symbol': symbol, 'children': [], 'anchor': relpath + heading['id'] })
            elif heading['tag'] == 'h3' and symbols:
                # The `and symbols` guard skips an h3 appearing before any
                # h2 (previously an IndexError on symbols[-1]).
                symbols[-1]['children'].append({ 'symbol': symbol, 'anchor': relpath + heading['id'] })

        # Trim empty children lists to save space (saves ~4kB at time of writing)
        for item in symbols:
            if len(item['children']) == 0:
                del item['children']

        return [{ 'symbol': self.title, 'children': symbols, 'anchor': relpath }]
|
| 93 |
|
| 94 |
|
def main():
    """Parse CLI args, extract headings from one HTML file, print JSON.

    Reads the HTML file named on the command line, feeds it through
    FindHeadings, and prints the resulting symbol index as a single JSON
    array on stdout. --base-dir controls how the anchor paths are
    relativized.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--base-dir', type=str, help='Base directory to reference links from')
    arg_parser.add_argument('html', help='HTML file to extract headings from')
    opts = arg_parser.parse_args()

    with open(opts.html) as f:
        page = f.read()

    finder = FindHeadings()
    finder.feed(page)

    # Anchors in the index are relative to --base-dir (or the CWD when unset).
    rel = os.path.relpath(opts.html, start=opts.base_dir)
    print(json.dumps(finder.get_symbols(rel)))
|
| 112 |
|
| 113 |
|
# Script entry point: only run when executed directly, not when imported.
if __name__ == '__main__':
    main()
|