doctools/spelling.py

OILS / doctools / spelling.py View on Github | oils.pub

143 lines, 86 significant

1	#!/usr/bin/env python2
2	"""spelling.py.
3
4	Filter the output of 'lynx -dump' into a list of words to spell check.
5	"""
6	from __future__ import print_function
7
8	from collections import Counter
9	import optparse
10	import re
11	import sys
12
13	from doctools.util import log
14	from typing import Iterator
15
16
def SplitWords(contents):
    # type: (str) -> Iterator[str]
    """Yield the words found in a text dump, in order of appearance.

    URLs (http, https, file) are removed first so that host names and path
    components don't show up as words.  A word is a run of ASCII letters,
    optionally followed by a 't contraction suffix:

    - doesn't
    - can't

    Args:
      contents: the full text to split.

    Yields:
      Each word as a string, duplicates included.
    """
    # The alternation must use a bare '|'.  An escaped '\|' matches a literal
    # pipe character, which means no real URL would ever be stripped.
    contents = re.sub(r'(?:https?|file)://\S+', '', contents)

    word_re = re.compile(
        r'''
        [a-zA-Z]+
        (?:\'t\b)?   # optional contraction
        ''', re.VERBOSE)

    # No capturing groups in the pattern, so group(0) is the whole word.
    for match in word_re.finditer(contents):
        yield match.group(0)
37
38
def WordList(f):
    """Yield every line of f with leading and trailing whitespace removed.

    Args:
      f: an iterable of lines, such as an open file with one word per line.

    Yields:
      One stripped string per input line.
    """
    # Input is expected to contain no special characters; nothing here
    # filters them, each line is only stripped.
    for raw_line in f:
        word = raw_line.strip()
        yield word
43
44
def Options():
    """Build the command line flag parser for this tool.

    Returns:
      An optparse.OptionParser that accepts --known-words and
      --more-than-bash.
    """
    # NOTE(review): nothing in the visible part of this file reads
    # more_than_bash; it may be left over from another tool -- confirm before
    # removing.
    flag_table = [
        ('--known-words',
         dict(dest='known_words',
              help='List of words like /usr/share/dict/words')),
        ('--more-than-bash',
         dict(dest='more_than_bash',
              type=int,
              default=0,
              help='Expected number of cases where OSH starts more processes than bash')),
    ]

    parser = optparse.OptionParser()
    for flag_name, attrs in flag_table:
        parser.add_option(flag_name, **attrs)
    return parser
59
60
def _PrintCounts(title, pairs):
    """Print a titled table of (word, count) pairs, one pair per line."""
    print('')
    print(title)
    print('')
    for word, count in pairs:
        print('%10d %s' % (count, word))


def main(argv):
    """Run the action named by the first positional argument.

    Actions:
      word-split  Read text from stdin and print one word per line.
      check       Treat the remaining args as word files (one word per line).
                  Print the most and least common words across all files,
                  then for each file print the words that are not in the
                  --known-words list and occur exactly once overall.

    Args:
      argv: full argument vector; argv[0] is the program name.

    Raises:
      RuntimeError: if the action is missing or invalid, or if 'check' is
        run without --known-words.
    """
    o = Options()
    opts, argv = o.parse_args(argv[1:])

    # Report a missing action the same way as an invalid one, instead of
    # failing with IndexError on argv[0].
    if not argv:
        raise RuntimeError('Action required (word-split or check)')
    action = argv[0]

    if action == 'word-split':
        contents = sys.stdin.read()
        for w in SplitWords(contents):
            print(w)

    elif action == 'check':
        # Validate before doing any work; open(None) would otherwise fail
        # halfway through the report.
        if opts.known_words is None:
            raise RuntimeError('check requires --known-words')

        word_files = argv[1:]

        # Word frequencies across the whole corpus.
        d = Counter()
        for path in word_files:
            with open(path) as f:
                d.update(WordList(f))

        by_count = d.most_common()  # sort once, slice both ends
        _PrintCounts('Most common words', by_count[:20])
        _PrintCounts('Least common words', by_count[-20:])

        log('%d word files', len(word_files))
        log('%d unique words', len(d))

        with open(opts.known_words) as f:
            known_words = set(WordList(f))

        print('')
        print('Potential Misspellings')
        print('')

        for path in word_files:

            print()
            print('\t%s' % path)
            print()

            with open(path) as f:
                # Try the exact form first so capitalized dictionary entries
                # (proper nouns) match, then the lowercased form so that
                # words capitalized at the start of a sentence match.
                unknown = set(
                    w for w in WordList(f)
                    if w not in known_words and
                    w.lower() not in known_words)

            if unknown:
                for u in sorted(unknown):
                    # only occurs once
                    if d.get(u) == 1:
                        print(u)
                log('\t%d unknown words in %s', len(unknown), path)

        # Checking algorithms:
        #
        # - Does it appear in the dictionary?  Problem: most computer terms
        # - Does it appear only once or twice in the whole corpus?
        # - Is the edit distance very close to a dictionary word?
        #   - e.g. substitutions is a typo

    else:
        raise RuntimeError('Invalid action %r' % action)
136
137
if __name__ == '__main__':
    # main() raises RuntimeError for an invalid action; report it as a
    # one-line message on stderr and exit with status 1.
    try:
        main(sys.argv)
    except RuntimeError as err:
        print('FATAL: %s' % err, file=sys.stderr)
        sys.exit(1)