| 1 | #!/usr/bin/env python3
|
| 2 |
|
| 3 | # This tool reads in the headings on a doc/ref page and produces a list of all
|
| 4 | # the symbols (and their anchors) which can be used as a search index.
|
| 5 | #
|
| 6 | # Currently a WIP.
|
| 7 | #
|
| 8 | # Usage:
|
| 9 | #
|
| 10 | # doctools/search_index.py _release/VERSION/doc/ref/chap-builtin-func.html
|
| 11 |
|
| 12 | from html.parser import HTMLParser
|
| 13 | import argparse
|
| 14 | import json
|
| 15 | import os
|
| 16 |
|
| 17 |
|
class FindHeadings(HTMLParser):
    """Collect the headings of an HTML page together with their anchors.

    Feed the page with ``feed()``. Every completed heading is appended to
    ``self.headings`` as a dict with the keys ``tag`` (``'h1'`` .. ``'h6'``),
    ``id`` (``'#'`` followed by the anchor name) and ``title`` (the heading
    text, with text chunks joined by single spaces).

    A heading is only recorded when it has text and an ``<a name="...">``
    anchor was seen since the previous heading was closed.
    """

    _HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self):
        super().__init__()

        # Records of the headings that are currently open, innermost last.
        self.stack = []
        # Completed headings, in document order.
        self.headings = []
        # Name of the most recent unused <a name="..."> anchor, if any.
        self.anchor = None
        # Record of the innermost open heading, or None outside of headings.
        self.heading = None

    def handle_starttag(self, tag, attrs):
        """Track opening heading tags and remember ``<a name=...>`` anchors."""
        if tag in self._HEADING_TAGS:
            self.heading = {'tag': tag, 'id': None}
            self.stack.append(self.heading)

        # Preceding each header is a <a name="anchor-name"></a>
        # Collect these anchors as link targets
        # Note: attrs is a list [('prop1', 'value'), ('prop2', 'value')]
        if tag == 'a' and len(attrs) == 1 and attrs[0][0] == 'name':
            self.anchor = attrs[0][1]

    def handle_endtag(self, tag):
        """Close the innermost open heading when its end tag is seen."""
        if not self.stack or self.stack[-1]['tag'] != tag:
            return

        finished = self.stack.pop()

        # Some headers are empty (or had no anchor) and never got a title
        if 'title' in finished:
            self.headings.append(finished)

        # Fall back to the enclosing heading, if headings were nested.
        self.heading = self.stack[-1] if self.stack else None

        # An anchor belongs to a single heading. Forget it so that a later
        # heading without its own anchor is dropped instead of being linked
        # to this one.
        self.anchor = None

    def handle_data(self, data):
        """Accumulate text found inside the innermost open heading."""
        # Ignore data outside of headers
        if not self.stack:
            return

        data = data.strip()
        if not data:
            # Some headers are empty
            return

        heading = self.stack[-1]
        if 'title' in heading:
            # The heading was already accepted; keep collecting its text.
            heading['title'] = heading['title'] + ' ' + data
        elif self.anchor:
            heading['title'] = data
            heading['id'] = '#' + self.anchor
        # Otherwise: we have to drop headers without anchors

    def get_symbols(self):
        """Return the search-index entries for the collected headings.

        Each ``h2`` yields ``{'symbol': title, 'anchor': id}``. Each ``h3``
        yields the same with the symbol prefixed by the title of the closest
        preceding ``h2`` and ``' > '``. An ``h3`` that appears before any
        ``h2`` is listed under its own title alone. Other heading levels are
        skipped.
        """
        section = None
        symbols = []
        for heading in self.headings:
            tag = heading['tag']
            title = heading['title']
            if tag == 'h2':
                section = title
                name = title
            elif tag == 'h3':
                name = title if section is None else section + ' > ' + title
            else:
                continue
            symbols.append({'symbol': name, 'anchor': heading['id']})

        return symbols
|
| 78 |
|
| 79 |
|
def main():
    """Print a JSON search index for the headings of one HTML page.

    Command line: ``[--base-dir DIR] HTML``. The page is parsed with
    ``FindHeadings``; each symbol's anchor is prefixed with the path of the
    page relative to ``--base-dir`` (relative to the current directory when
    the option is omitted), and the resulting list is written to stdout as
    JSON.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--base-dir', type=str, help='Base directory to reference links from')
    parser.add_argument('html', help='HTML file to extract headings from')

    args = parser.parse_args()

    # Decode explicitly so the result does not depend on the locale's default
    # encoding. NOTE(review): assumes the doc pages are UTF-8 -- confirm.
    with open(args.html, encoding='utf-8') as f:
        source = f.read()

    find_headings = FindHeadings()
    find_headings.feed(source)

    symbols = find_headings.get_symbols()

    # The page path is the same for every symbol, so compute it only once.
    relpath = os.path.relpath(args.html, start=args.base_dir)
    for sym in symbols:
        sym['anchor'] = relpath + sym['anchor']

    print(json.dumps(symbols))
|
| 99 |
|
| 100 |
|
# Run the command-line entry point only when executed as a script, so that
# importing this module has no side effects.
if __name__ == '__main__':
    main()
|