A (fairly lengthy) program to solve this problem is given here.
import os
def read_words(filename):
    """
    Read the corpus file filename and return its untagged words as a list.

    The file is expected to hold whitespace-separated tokens of the form
    word/tag, where tag is a part-of-speech tag. Each accepted word is
    lower-cased and appended in the order it is encountered, so repeated
    words appear repeatedly. A token is skipped when:

    * it does not contain exactly one '/' (no tag, or a '/' inside the word);
    * the word is not made up entirely of letters;
    * its tag starts with 'fw' (foreign word) or 'np' (proper noun).
    """
    words = []
    with open(filename, 'r') as fi:
        # Iterate over the file object directly so that lines are streamed
        # rather than all being loaded into memory first.
        for line in fi:
            # Split the line into tagged words on whitespace
            for word_pos in line.split():
                try:
                    # Split the tagged word into word and part-of-speech tag
                    word, pos = word_pos.split('/')
                except ValueError:
                    # Edge-case: tokens without exactly one '/' cannot be
                    # separated unambiguously into word and tag: ignore them
                    continue
                word = word.lower()
                if not word.isalpha():
                    # Only consider words consisting entirely of letters
                    continue
                if pos.startswith(('fw', 'np')):
                    # Ignore foreign words and proper nouns
                    continue
                words.append(word)
    return words
def count_letter_pairs(words, letter_pair_count, max_word_len):
    """
    Analyse a list of words for letter-pairs and keep track of their frequency
    of occurrence by length of word.

    letter_pair_count is a list of dictionaries indexed by word length, and it
    is updated in place (nothing is returned). Each dictionary is keyed by
    letter-pair and has as its value a further dictionary of word: count
    items. It must have an entry for every word length from 2 up to
    max_word_len.

    Words longer than max_word_len are ignored, and words shorter than two
    letters contribute nothing because they contain no pairs. The count for
    a word is incremented once for every position at which the pair occurs,
    so a word containing the same pair twice adds 2 each time it is seen.
    """
    for word in words:
        word_len = len(word)
        if word_len < 2 or word_len > max_word_len:
            continue
        # All pairs from this word go into the dictionary for its length.
        pairs_for_len = letter_pair_count[word_len]
        # Loop over each contiguous letter pair in the word
        for i in range(word_len - 1):
            letter_pair = word[i:i+2]
            # Create the word: count dictionary the first time a pair is met.
            word_counts = pairs_for_len.setdefault(letter_pair, {})
            word_counts[word] = word_counts.get(word, 0) + 1
def get_letter_pair_counts(corpus_filepaths, max_word_len=16):
    """
    Collect letter-pair statistics across the corpus sample files listed in
    corpus_filepaths, considering only words of at most max_word_len letters.

    Return a list indexed by word length. Entries 0 and 1 are None, since
    0-letter and 1-letter words don't have letter pairs. Every other entry,
    up to and including max_word_len, is a list of
    (letter-pair, {word: count}) tuples sorted by letter-pair. The returned
    list therefore always has at least two entries.
    """
    # One {letter-pair: {word: count}} dictionary per possible word length.
    letter_pair_count = [{} for _ in range(max_word_len + 1)]
    for corpus_filepath in corpus_filepaths:
        words = read_words(corpus_filepath)
        count_letter_pairs(words, letter_pair_count, max_word_len)

    # 0-letter and 1-letter words don't have letter pairs:
    counts = [None, None]
    # Letter-pairs are unique keys, so sorting the items orders them by pair
    # and never has to compare the dictionaries.
    counts.extend(sorted(letter_pair_count[word_len].items())
                  for word_len in range(2, max_word_len + 1))
    return counts
def get_corpus_filepaths(brown_dir):
    """
    Get a list of the corpus sample files from the brown_dir directory.

    Every entry of brown_dir is returned as a path prefixed with brown_dir,
    except for hidden entries (names starting with '.', e.g. temporary or
    backup files). The paths are in the order given by os.listdir, which is
    arbitrary.
    """
    return [os.path.join(brown_dir, filename)
            for filename in os.listdir(brown_dir)
            if not filename.startswith('.')]
# Directory holding the corpus sample files, relative to the working directory
brown_dir = 'brown'
corpus_filepaths = get_corpus_filepaths(brown_dir)

# Get the letter-pair counts data structure for words of up to 8 letters and
# report only those letter-pairs which are found in exactly two distinct
# 8-letter words, along with those two words and their counts
counts = get_letter_pair_counts(corpus_filepaths, max_word_len=8)
for letter_pair, word_dict in counts[8]:
    if len(word_dict) == 2:
        print('{}: {}'.format(letter_pair, word_dict))
The output is:
aj: {'majority': 57, 'majestic': 10}
bn: {'abnormal': 3, 'numbness': 2}
bp: {'subparts': 1, 'subpenas': 1}
dc: {'seedcoat': 1, 'redcoats': 9}
dp: {'midpoint': 1, 'tadpoles': 1}
hh: {'withheld': 8, 'withhold': 2}
hp: {'southpaw': 5, 'fishpond': 1}
ih: {'nihilism': 1, 'nihilist': 2}
ji: {'jingling': 1, 'jiggling': 1}
kb: {'backbend': 1, 'backbone': 4}
kd: {'jackdaws': 1, 'backdrop': 2}
kf: {'thankful': 6, 'cookfire': 1}
kh: {'sinkhole': 1, 'bulkhead': 1}
km: {'stickman': 1, 'bunkmate': 1}
kr: {'bankrupt': 5, 'sickroom': 1}
kt: {'tektites': 5, 'cocktail': 25}
ku: {'skullcap': 3, 'breakups': 1}
ky: {'backyard': 2, 'skylight': 1}
mr: {'shamrock': 3, 'comrades': 10}
mt: {'boomtown': 1, 'undreamt': 1}
pb: {'snapback': 1, 'cupboard': 2}
pc: {'topcoats': 1, 'upcoming': 1}
pd: {'trapdoor': 1, 'tapdance': 1}
pf: {'campfire': 2, 'leapfrog': 2}
pw: {'shopworn': 1, 'stepwise': 3}
uk: {'leukemia': 3, 'lukewarm': 5}
ux: {'luxuries': 3, 'tuxedoed': 1}
uz: {'puzzling': 9, 'suzerain': 2}
wc: {'newcomer': 7, 'showcase': 3}
xr: {'xreserve': 1, 'xrelease': 2}
yf: {'whyfores': 1, 'joyfully': 1}
yh: {'ballyhoo': 1, 'babyhood': 1}
yu: {'yugoslav': 7, 'picayune': 1}