OILS / doctools / spelling.py View on Github | oils.pub

143 lines, 86 significant
1#!/usr/bin/env python2
2"""spelling.py.
3
4Filter the output of 'lynx -dump' into a list of words to spell check.
5"""
6from __future__ import print_function
7
8from collections import Counter
9import optparse
10import re
11import sys
12
13from doctools.util import log
14from typing import Iterator
15
16
def SplitWords(contents):
    # type: (str) -> Iterator[str]
    """Yield each word found in contents, in order of appearance.

    A word is a run of ASCII letters, optionally followed by a 't
    contraction suffix (as in "doesn't" or "can't").  URLs with an http,
    https or file scheme are deleted first so that their path components
    are not reported as words.
    """
    # Strip URLs before matching; otherwise every path segment is a "word".
    url_free = re.sub(r'(http|https|file)://\S+', '', contents)

    word_pat = re.compile(
        r'''
        [a-zA-Z]+
        (?:\'t\b)?     # optional contraction
        ''', re.VERBOSE)

    # The pattern has no capturing groups, so group(0) is the whole word.
    for match in word_pat.finditer(url_free):
        yield match.group(0)
37
38
def WordList(f):
    """Yield one entry per line of f with surrounding whitespace removed.

    f is any iterable of lines, such as an open text file.  Nothing else is
    filtered here: a blank line is yielded as an empty string.
    """
    for raw_line in f:
        entry = raw_line.strip()
        yield entry
43
44
def Options():
    """Build and return the optparse parser for this tool's flags."""
    parser = optparse.OptionParser()

    # File of accepted words, one per line.
    parser.add_option(
        '--known-words',
        dest='known_words',
        help='List of words like /usr/share/dict/words')

    # NOTE(review): this flag is not read anywhere in the visible part of
    # this file; it may be left over from another tool.  Kept so existing
    # command lines keep parsing.
    parser.add_option(
        '--more-than-bash',
        dest='more_than_bash',
        type=int,
        default=0,
        help='Expected number of cases where OSH starts more processes than bash')

    return parser
59
60
def main(argv):
    """Run the action named on the command line.

    Args:
      argv: full argument vector, with the program name at argv[0].  After
        any flags, the first positional argument selects the action:

        word-split  Read text from stdin and print one word per line.
        check       Treat the remaining arguments as word files (one word
                    per line).  Print the most and least common words, then
                    for each file print the words that are missing from the
                    --known-words file and occur exactly once across all of
                    the word files.

    Raises:
      RuntimeError: if no action is given, the action is not recognized, or
        'check' is requested without --known-words.
    """
    o = Options()
    opts, argv = o.parse_args(argv[1:])

    # Fail with a readable message instead of an IndexError on argv[0].
    if not argv:
        raise RuntimeError('Expected an action: word-split or check')
    action = argv[0]

    if action == 'word-split':
        contents = sys.stdin.read()
        for w in SplitWords(contents):
            print(w)

    elif action == 'check':
        # Validate up front; otherwise open(None) fails later with a
        # TypeError after the statistics were already printed.
        if not opts.known_words:
            raise RuntimeError('--known-words is required for the check action')

        word_files = argv[1:]

        # Occurrences of each word across ALL word files.
        counts = Counter()
        for path in word_files:
            with open(path) as f:
                counts.update(WordList(f))

        # Sort once and slice both ends from the same ranking.
        ranked = counts.most_common()

        print('')
        print('Most common words')
        print('')
        for word, count in ranked[:20]:
            print('%10d %s' % (count, word))

        print('')
        print('Least common words')
        print('')
        for word, count in ranked[-20:]:
            print('%10d %s' % (count, word))

        log('%d word files', len(word_files))
        log('%d unique words', len(counts))

        with open(opts.known_words) as f:
            known_words = set(WordList(f))

        print('')
        print('Potential Misspellings')
        print('')

        for path in word_files:

            print('')
            print('\t%s' % path)
            print('')

            unknown = set()
            with open(path) as f:
                for w in WordList(f):
                    # Try the exact spelling first so capitalized dictionary
                    # entries (e.g. proper nouns) can match, then fall back
                    # to lowercase so a capitalized ordinary word is accepted.
                    if w not in known_words and w.lower() not in known_words:
                        unknown.add(w)

            if unknown:
                for u in sorted(unknown):
                    # Only report words that occur once in the whole corpus;
                    # repeated unknown words are more likely to be jargon.
                    if counts[u] == 1:
                        print(u)
                log('\t%d unknown words in %s', len(unknown), path)

        # Checking algorithms:
        #
        # - Does it appear in the dictionary?  Problem: most computer terms
        # - Does it appear only once or twice in the whole corpus?
        # - Is the edit distance very close to a dictionary word?
        #   - e.g. substitutions is a typo

    else:
        raise RuntimeError('Invalid action %r' % action)
136
137
if __name__ == '__main__':
    # Script entry point: a RuntimeError from main() is an expected failure,
    # so report it on stderr and exit with status 1 instead of a traceback.
    try:
        main(sys.argv)
    except RuntimeError as err:
        message = 'FATAL: %s' % err
        print(message, file=sys.stderr)
        sys.exit(1)