A common type of analysis of text data is to look at the frequency with which different words appear in the text. In this worksheet we will look at applying word frequency analysis to novel full-texts.
To begin, we will focus on Pride and Prejudice by Jane Austen. For this novel, the file fulltext.txt contains an annotated version of the original text of the novel as provided on Project Gutenberg.
# import the Python packages that we will need to use later on
import re
from collections import Counter
import matplotlib.pyplot as plt
# the relevant files for this novel are stored in the directory below
from pathlib import Path
dir_pride = Path("data") / "pride_prejudice"
Before applying word frequency analysis, we need to apply a number of preparation steps to the full-text.
Firstly, read the entire novel into a single string. Print the length of this string.
# specify the path of the file
novel_fulltext_path = dir_pride / "fulltext.txt"
# load the file
lines = []
with open(novel_fulltext_path, "r", encoding="utf-8") as fin:
    # read in the entire novel
    fulltext = fin.read()
print("Novel full-text has length %d characters" % len(fulltext))
Next, convert the text that you have loaded into all lowercase.
fulltext = fulltext.lower()
Using the text prepared from Task 1, we now can start to identify the words in the novel's text.
Firstly, split the text into a list of all words appearing in the text. We can define a word as a substring that is separated by whitespace characters (e.g., spaces, tabs) and/or punctuation symbols.
(Hint: We can do this a number of different ways, including by using regular expressions)
# split based on whitespace characters and punctuation
pattern = re.compile(r"\W+")
all_words = pattern.split(fulltext)
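As the hint suggests, there are other ways to do this split besides regular expressions. One regex-free sketch maps every punctuation symbol to a space with str.translate and then splits on whitespace; the sample sentence below is illustrative only, not drawn from the file.

```python
import string

def simple_split(text):
    # build a translation table sending each punctuation symbol to a space
    table = str.maketrans({p: " " for p in string.punctuation})
    # replace punctuation with spaces, then split on any whitespace
    return text.translate(table).split()

print(simple_split("it is a truth, universally acknowledged!"))
```

Both approaches give the same word list for simple cases, although the regex version handles any non-word character (not just ASCII punctuation).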
Next, filter out any words from the list which contain fewer than 2 characters (symbols). Report the number of filtered words and the number of remaining words.
keep_words = []
for word in all_words:
    if len(word) >= 2:
        keep_words.append(word)
num_filtered = len(all_words) - len(keep_words)
print("Filtered %d words - Kept %d words" % (num_filtered, len(keep_words)))
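The filtering loop above can also be written as a list comprehension; both forms produce the same list. The words below are illustrative, not taken from the novel.

```python
# keep only the words with at least 2 characters
all_words = ["it", "is", "a", "truth", "universally", "acknowledged"]
keep_words = [word for word in all_words if len(word) >= 2]
print(keep_words)
```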
Report the number of unique words that appear in the list of remaining words.
# use a set to find the number of unique values in a list
unique_words = set(keep_words)
print("Novel full-text has %d remaining unique words" % len(unique_words))
Using the remaining words from Task 2, count the number of times that each word appears in the list (i.e., the word frequencies).
Display the top-20 most common words.
(Hint: a Python Counter might be useful here)
# turn the list of filtered words into a counter
word_freqs = Counter(keep_words)
# print the top 20
print("Top-20 most common words in the novel are:")
for word, count in word_freqs.most_common(20):
    print("%d \t %s" % (count, word))
Many of the words we see above are common stop-words which frequently appear in the English language and might not convey much information about the novel itself. An example set of stop-words is given below.
stopwords = set(["am", "an", "and", "are", "as", "at", "be", "been", "but", "by", "can", "could", "do", "did",
                 "for", "from", "had", "has", "have", "how", "i", "if", "in", "is", "it", "its", "me", "must", "my", "no", "not",
                 "of", "on", "one", "or", "our", "say", "said", "shall", "so", "some", "such", "that", "than", "the",
                 "them", "there", "this", "these", "to", "was", "were", "what", "when", "where", "which", "who", "why",
                 "will", "with", "would", "you", "your"])
Remove all of the stop-words from the current list of word frequencies and display the top-20 most common remaining words.
# remove the frequencies for these words; some stop-words (e.g. "i")
# were already dropped by the length filter, so use pop() with a
# default value to avoid a KeyError for missing keys
for stopword in stopwords:
    word_freqs.pop(stopword, None)
# print the top 20 again
print("Top-20 most common words in the novel are:")
for word, count in word_freqs.most_common(20):
    print("%d \t %s" % (count, word))
Visualise the frequencies for the top-20 words from above using a horizontal bar chart.
# from the Counter get the top words and corresponding frequencies
top_words = []
top_freqs = []
for word, freq in word_freqs.most_common(20):
    top_words.append(word)
    top_freqs.append(freq)
# we have to reverse the lists to get the largest value to appear at the top of a horizontal bar chart
top_words.reverse()
top_freqs.reverse()
# now create a plot to display them
plt.figure(figsize=(7, 7))
ax = plt.barh(top_words, top_freqs, color="darkgreen")
# add axis labels to the chart
plt.xlabel("Frequency", fontsize=13)
plt.show()
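The two lists above can also be built in a single step: zip(*...) transposes a list of (word, frequency) pairs into separate tuples, and reversed() flips the order so that the largest bar appears at the top of the chart. The pairs below are illustrative, not actual counts from the novel.

```python
# transpose a list of (word, frequency) pairs, in reverse order
pairs = [("mr", 785), ("elizabeth", 635), ("very", 488)]
top_words, top_freqs = zip(*reversed(pairs))
print(list(top_words))
print(list(top_freqs))
```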
Now try removing additional stopwords and regenerating the chart above to see how it affects the top-20 word visualisation.
# remove further stopwords... we will just try adding a few more here
extra_stopwords = ["all", "any", "every", "know", "more", "much", "they", "very"]
for stopword in extra_stopwords:
    word_freqs.pop(stopword, None)
# recreate the plot
top_words = []
top_freqs = []
for word, freq in word_freqs.most_common(20):
    top_words.append(word)
    top_freqs.append(freq)
# we have to reverse the lists to get the largest value to appear at the top of a horizontal bar chart
top_words.reverse()
top_freqs.reverse()
# now create a plot to display them
plt.figure(figsize=(7, 7))
ax = plt.barh(top_words, top_freqs, color="darkgreen")
# add axis labels to the chart
plt.xlabel("Frequency", fontsize=13)
plt.show()
Next, we expand the word frequency analysis above to consider the full-texts for two different novels in our dataset:
Dracula by Bram Stoker
Frankenstein by Mary Shelley
# the relevant files for these novels are stored in the directories below
dir_dracula = Path("data") / "dracula"
dir_frankenstein = Path("data") / "frankenstein"
Load and prepare the two full-texts using the steps that we saw previously.
# define a function to load the text and apply the text preparation
def load_and_prepare(in_path):
    # load the file into a string
    with open(in_path, "r", encoding="utf-8") as fin:
        # read in the entire novel
        fulltext = fin.read()
    # convert it to lowercase and return it
    return fulltext.lower()
# load the text for this first novel
fulltext_path_dracula = dir_dracula / "fulltext.txt"
fulltext_dracula = load_and_prepare(fulltext_path_dracula)
# second novel
fulltext_path_frankenstein = dir_frankenstein / "fulltext.txt"
fulltext_frankenstein = load_and_prepare(fulltext_path_frankenstein)
For each text, identify a list of the top-30 most common words. You should filter short words (< length 2) and common stop-words as part of this process.
# define a function to split the texts, filter the words, and return the top 30
def find_top30_words(fulltext):
    # find all of the words
    pattern = re.compile(r"\W+")
    all_words = pattern.split(fulltext)
    # remove the short words
    keep_words = []
    for word in all_words:
        if len(word) >= 2:
            keep_words.append(word)
    # count the word frequencies
    word_freqs = Counter(keep_words)
    # remove the stopwords, ignoring any that are not present
    for stopword in stopwords:
        word_freqs.pop(stopword, None)
    # return the top words in a list (without their frequencies)
    top_list = []
    for word, freq in word_freqs.most_common(30):
        top_list.append(word)
    return top_list
top_dracula = find_top30_words(fulltext_dracula)
print("Top 30 most common words in the book Dracula:")
for i, word in enumerate(top_dracula):
    print("%02d) %s" % (i+1, word))
top_frankenstein = find_top30_words(fulltext_frankenstein)
print("Top 30 most common words in the book Frankenstein:")
for i, word in enumerate(top_frankenstein):
    print("%02d) %s" % (i+1, word))
From the top-30 word lists, identify:
The top words common to both novels
The top words unique to Dracula
The top words unique to Frankenstein
# convert the lists to sets first
set_top_dracula = set(top_dracula)
set_top_frankenstein = set(top_frankenstein)
# get the words common to both (set intersection)
print("Top-30 words common to both Dracula and Frankenstein:")
print(set_top_dracula.intersection(set_top_frankenstein))
# use set difference operators
print("Top-30 words unique to Dracula:")
print(set_top_dracula.difference(set_top_frankenstein))
print("Top-30 words unique to Frankenstein:")
print(set_top_frankenstein.difference(set_top_dracula))
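Python's set operators are shorthand for the methods used above: `&` is intersection and `-` is difference. A small illustration with made-up word sets:

```python
# operator forms of the set methods used above
a = {"night", "time", "good", "dear"}
b = {"life", "time", "good", "father"}
print(a & b)   # same as a.intersection(b)
print(a - b)   # same as a.difference(b)
```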