Word analysis

Solution P4.3.6

A (fairly lengthy) program to solve this problem is given here.

import os

def read_words(filename):
    """
    Read in and untag the words from the corpus file filename.

    Each whitespace-separated token in the file is expected to have the form
    word/tag. A token contributes its lower-cased word to the result only if:

    * it splits into exactly two parts on '/' (tokens with no '/' or with
      more than one '/' are skipped);
    * the word consists entirely of letters;
    * the tag does not start with 'fw' (foreign word) or 'np' (proper noun).

    Returns the list of accepted words in file order, repeats included.

    """
    words = []
    with open(filename, 'r') as fi:
        # Iterate over the file object itself so that lines are streamed one
        # at a time instead of being loaded all at once by readlines().
        for line in fi:
            # split the line into tagged words on whitespace
            for word_pos in line.split():
                try:
                    # Split the tagged word into word and part-of-speech tag
                    word, pos = word_pos.split('/')
                except ValueError:
                    # Edge-case: untagged tokens and "words" that themselves
                    # contain '/' do not unpack into two parts: ignore them
                    continue
                word = word.lower()
                if not word.isalpha():
                    # Only consider words consisting entirely of letters
                    continue
                if pos.startswith(('fw', 'np')):
                    # Ignore foreign words and proper nouns
                    continue
                words.append(word)
    return words

def count_letter_pairs(words, letter_pair_count, max_word_len):
    """
    Tally the contiguous letter-pairs found in words, grouped by word length.

    letter_pair_count is a list of dictionaries indexed by word length and is
    updated in place; nothing is returned. Each dictionary maps a letter-pair
    to a further dictionary of word: count items, where the count goes up by
    one for every position at which the pair occurs in every occurrence of
    the word (so a pair repeated inside a single word is counted each time).
    Words longer than max_word_len are ignored.

    """

    for word in words:
        n_letters = len(word)
        if n_letters > max_word_len:
            continue
        # Every window of two adjacent letters, taken left to right
        pairs = (word[start:start + 2] for start in range(n_letters - 1))
        for pair in pairs:
            # Fetch the per-word tally for this pair, creating it on first use
            tally = letter_pair_count[n_letters].setdefault(pair, {})
            tally[word] = tally.get(word, 0) + 1

def get_letter_pair_counts(corpus_filepaths, max_word_len=16):
    """
    Collect letter-pair statistics across the corpus sample files listed in
    corpus_filepaths, for words of up to max_word_len letters.

    The returned list is indexed by word length. Entries 0 and 1 are None,
    since such words have no letter pairs; every other entry is a list of
    (letter-pair, {word: frequency} dictionary) tuples sorted by letter-pair.

    """

    # One dictionary per word length, from 0 up to and including max_word_len
    per_length = [dict() for _ in range(max_word_len + 1)]
    for path in corpus_filepaths:
        count_letter_pairs(read_words(path), per_length, max_word_len)

    # 0-letter and 1-letter words don't have letter pairs, hence the Nones
    return [None, None] + [sorted(per_length[length].items())
                           for length in range(2, max_word_len + 1)]

def get_corpus_filepaths(brown_dir):
    """
    Get a list of the corpus sample files from the brown_dir directory.

    Every directory entry whose name does not start with '.' is joined onto
    brown_dir and returned. The names are sorted first because os.listdir()
    yields entries in arbitrary order; sorting makes the list, and anything
    built by processing the files in turn, the same from run to run and
    platform to platform.

    """
    return [os.path.join(brown_dir, filename)
            for filename in sorted(os.listdir(brown_dir))
            # Ignore hidden files (e.g. temporary or backup files)
            if not filename.startswith('.')]

brown_dir = 'brown'
corpus_filepaths = get_corpus_filepaths(brown_dir)

# Build the letter-pair counts data structure for words of up to 8 letters,
# then report every letter-pair that is found in exactly two distinct
# 8-letter words, together with those words and their counts
counts = get_letter_pair_counts(corpus_filepaths, max_word_len=8)
for letter_pair, word_dict in counts[8]:
    if len(word_dict) != 2:
        # Shared by fewer or more than two different words: not of interest
        continue
    print('{}: {}'.format(letter_pair, word_dict))

The output is:

aj: {'majority': 57, 'majestic': 10}
bn: {'abnormal': 3, 'numbness': 2}
bp: {'subparts': 1, 'subpenas': 1}
dc: {'seedcoat': 1, 'redcoats': 9}
dp: {'midpoint': 1, 'tadpoles': 1}
hh: {'withheld': 8, 'withhold': 2}
hp: {'southpaw': 5, 'fishpond': 1}
ih: {'nihilism': 1, 'nihilist': 2}
ji: {'jingling': 1, 'jiggling': 1}
kb: {'backbend': 1, 'backbone': 4}
kd: {'jackdaws': 1, 'backdrop': 2}
kf: {'thankful': 6, 'cookfire': 1}
kh: {'sinkhole': 1, 'bulkhead': 1}
km: {'stickman': 1, 'bunkmate': 1}
kr: {'bankrupt': 5, 'sickroom': 1}
kt: {'tektites': 5, 'cocktail': 25}
ku: {'skullcap': 3, 'breakups': 1}
ky: {'backyard': 2, 'skylight': 1}
mr: {'shamrock': 3, 'comrades': 10}
mt: {'boomtown': 1, 'undreamt': 1}
pb: {'snapback': 1, 'cupboard': 2}
pc: {'topcoats': 1, 'upcoming': 1}
pd: {'trapdoor': 1, 'tapdance': 1}
pf: {'campfire': 2, 'leapfrog': 2}
pw: {'shopworn': 1, 'stepwise': 3}
uk: {'leukemia': 3, 'lukewarm': 5}
ux: {'luxuries': 3, 'tuxedoed': 1}
uz: {'puzzling': 9, 'suzerain': 2}
wc: {'newcomer': 7, 'showcase': 3}
xr: {'xreserve': 1, 'xrelease': 2}
yf: {'whyfores': 1, 'joyfully': 1}
yh: {'ballyhoo': 1, 'babyhood': 1}
yu: {'yugoslav': 7, 'picayune': 1}