| 1 | #!/usr/bin/env python3
|
| 2 |
|
| 3 | # This tool reads in the headings on a doc/ref page and produces a list of all
|
| 4 | # the symbols (and their anchors) which can be used as a search index.
|
| 5 | #
|
| 6 | # Currently a WIP.
|
| 7 | #
|
| 8 | # Usage:
|
| 9 | #
|
| 10 | # doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
|
| 11 |
|
| 12 | from html.parser import HTMLParser
|
| 13 | import argparse
|
| 14 | import json
|
| 15 | import os
|
| 16 |
|
| 17 |
|
class FindHeadings(HTMLParser):
    """Extract (heading text, anchor) pairs from a doc/ref HTML page.

    Each heading is expected to be preceded by an ``<a name="..."></a>``
    anchor tag; headings with no anchor, or with no text, are dropped.
    After ``feed()``-ing the page, call ``get_symbols()`` to build a
    nested, JSON-serializable symbol index.
    """

    def __init__(self):
        super().__init__()

        self.stack = []      # open heading tags, e.g. [{'tag': 'h2', 'id': None}]
        self.headings = []   # completed headings: dicts with 'tag', 'id', 'title'
        self.anchor = None   # name= of the most recent <a>, not yet consumed
        self.heading = None  # heading currently being accumulated, or None

        self.title = None    # contents of the <title> element, once seen

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            # Empty string means "capture the next data chunk as the title".
            self.title = ''

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            self.stack.append({'tag': tag, 'id': None})
            self.heading = dict(self.stack[-1])

        # Preceding each header is a <a name="anchor-name"></a>
        # Collect these anchors as link targets
        # Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
        # Fix: was `tag in 'a'`, a substring test (also true for '').
        if tag == 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
            self.anchor = attrs[0][1]

    def handle_endtag(self, tag):
        if len(self.stack) > 0 and self.stack[-1]['tag'] == tag:
            self.stack.pop()

            # Some headers are empty (no text, or no anchor) - drop them.
            # Fix: guard against self.heading being None (nested or
            # duplicated close tags) instead of raising TypeError.
            if self.heading is not None and 'title' in self.heading:
                self.headings.append(self.heading)
            self.heading = None

            # Fix: an anchor belongs to at most one heading.  Forget it so
            # a later anchor-less heading isn't assigned this stale anchor
            # (the intent is to drop headers without anchors).
            self.anchor = None

    def handle_data(self, data):
        if self.title == '':
            self.title = data

        # Ignore data outside of headers
        if len(self.stack) == 0:
            return

        # We have to drop headers without anchors
        if not self.anchor:
            return

        # Fix: defensive guard; mirrors the handle_endtag fix.
        if self.heading is None:
            return

        data = data.strip()
        if not data:
            # Some headers are empty
            return

        if 'title' in self.heading:
            # Heading text can arrive in several chunks (e.g. around
            # inline markup); join chunks with single spaces.
            self.heading['title'] = self.heading['title'] + ' ' + data
        else:
            self.heading['title'] = data
            self.heading['id'] = '#' + self.anchor

    def get_symbols(self, relpath):
        """Return the nested symbol index for this page.

        relpath: page path relative to the site root, used to build anchors
            like 'doc/ref/chap-x.html#anchor-name'.

        Returns a one-element list: the page <title> with h2 headings as
        children and h3 headings nested under the preceding h2.  Returns
        [] when no <title> was seen.
        """
        symbol = None
        symbols = []

        if not self.title:
            return []

        for heading in self.headings:
            symbol = heading['title']
            if heading['tag'] == 'h2':
                symbols.append({ 'symbol': symbol, 'children': [], 'anchor': relpath + heading['id'] })
            elif heading['tag'] == 'h3' and symbols:
                # Fix: an h3 before any h2 has no parent; skip it rather
                # than crash with IndexError on symbols[-1].
                symbols[-1]['children'].append({ 'symbol': symbol, 'anchor': relpath + heading['id'] })

        # Trim empty children lists to save space (saves ~4kB at time of writing)
        for item in symbols:
            if len(item['children']) == 0:
                del item['children']

        return [{ 'symbol': self.title, 'children': symbols, 'anchor': relpath }]
|
| 95 |
|
| 96 |
|
def main():
    """Parse CLI arguments, extract the headings of the given HTML file,
    and print the symbol index as JSON on stdout."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--base-dir', type=str, help='Base directory to reference links from')
    arg_parser.add_argument('html', help='HTML file to extract headings from')
    opts = arg_parser.parse_args()

    with open(opts.html) as f:
        page_source = f.read()

    heading_finder = FindHeadings()
    heading_finder.feed(page_source)

    # Anchors in the index are written relative to --base-dir.
    rel = os.path.relpath(opts.html, start=opts.base_dir)
    print(json.dumps(heading_finder.get_symbols(rel)))
|
| 114 |
|
| 115 |
|
# Script entry point: run only when executed directly, not on import.
if __name__ == '__main__':
    main()
|