# import the Python packages that we will need to use later on
import re
import matplotlib.pyplot as plt


# the relevant files for this novel are stored in the directory below
from pathlib import Path
dir_pride = Path("data", "pride_prejudice")


# location of the novel's full text inside that directory
novel_fulltext_path = dir_pride / "fulltext.txt"
# the file is plain UTF-8 text (not JSON): read all of it in one go
with novel_fulltext_path.open("r", encoding="utf-8") as fin:
    text = fin.read()
# break the text into individual lines, without their line endings
lines = text.splitlines()
# report the total number of lines, blank ones included
print("Novel has %d total lines of text" % len(lines))


# tally the lines that still contain something once surrounding whitespace
# is stripped (a stripped blank line is the empty string, which is falsy)
count_non_empty = sum(1 for line in lines if line.strip())
print("Novel has %d non-empty lines of text" % count_non_empty)


# regular expression identifying a chapter heading such as "CHAPTER 12";
# the capture group holds the chapter number
pattern = re.compile(r"CHAPTER ([0-9]+)")

# walk through the novel and collect the non-empty lines of each chapter in a
# dictionary that maps chapter number -> list of lines
chapter_lines = {}
current_chapter_number = 0
for line in lines:
    # skip blank lines, including whitespace-only ones, so that "non-empty"
    # means the same thing here as in the other line counts in this file
    if not line.strip():
        continue
    # does this line start a new chapter?
    match = pattern.match(line)
    if match:
        # the first capture group is the chapter number
        current_chapter_number = int(match.group(1))
        # start a fresh list of lines for this chapter
        chapter_lines[current_chapter_number] = []
    elif current_chapter_number not in chapter_lines:
        # text that appears before the first chapter heading (title page,
        # preface, ...) belongs to no chapter; skipping it avoids a KeyError
        # on the lookup below
        continue
    # record the line (the heading itself included) under the current chapter
    chapter_lines[current_chapter_number].append(line)

# count the chapters actually collected, rather than relying on the number
# that happened to appear in the last heading
print("Novel has %d chapters" % len(chapter_lines))


# number of lines collected for each chapter, in dictionary order
chapter_line_counts = [len(chapter) for chapter in chapter_lines.values()]
# mean chapter length, measured in lines
average_chapter_lines = sum(chapter_line_counts) / len(chapter_line_counts)
print("Novel has an average of %.1f non-empty lines per chapter" % average_chapter_lines)


# x-axis values: the chapter numbers, taken from the same dictionary (and in
# the same order) as the line counts, so both sequences always have the same
# length even when the chapter numbering has gaps
chapter_numbers = list(chapter_lines)
# generate a line plot from the chapter length counts
plt.figure(figsize=(10, 5))
# NOTE: plt.plot returns the list of plotted line objects, not an Axes
ax = plt.plot(chapter_numbers, chapter_line_counts, color="darkorange", lw=2)
# add axis labels to the plot
plt.xlabel("Chapter", fontsize=13)
plt.ylabel("Number of Non-Empty Lines", fontsize=13)
# adjust the range for the y-axis, so it starts at 0
plt.ylim(0)
# adjust the range for the x-axis, so it ends at the highest chapter number
plt.xlim([1, max(chapter_numbers)])
plt.show()


# each novel keeps its files in its own sub-directory of "data"
dir_pride = Path("data", "pride_prejudice")
dir_dracula = Path("data", "dracula")
dir_frankenstein = Path("data", "frankenstein")


# create a function to load a file and count non-empty lines
def count_non_empty(in_path):
    """Return the number of non-empty lines in the UTF-8 text file at in_path.

    A line counts as non-empty when something remains after stripping its
    leading and trailing whitespace.
    """
    with open(in_path, "r", encoding="utf-8") as fin:
        contents = fin.read()
    # a stripped blank line is the empty string, which is falsy
    return sum(1 for row in contents.splitlines() if row.strip())


# full-text file of each of the three novels
fulltext_path_pride = dir_pride.joinpath("fulltext.txt")
fulltext_path_dracula = dir_dracula.joinpath("fulltext.txt")
fulltext_path_frankenstein = dir_frankenstein.joinpath("fulltext.txt")

# count and report the non-empty lines, one novel at a time
count_pride = count_non_empty(fulltext_path_pride)
print("Pride and Prejudice: %d non-empty lines" % count_pride)
count_dracula = count_non_empty(fulltext_path_dracula)
print("Dracula: %d non-empty lines" % count_dracula)
count_frankenstein = count_non_empty(fulltext_path_frankenstein)
print("Frankenstein: %d non-empty lines" % count_frankenstein)


# bar chart comparing the novels: one bar per novel, with its height given
# by that novel's non-empty line count
line_counts = [count_pride, count_dracula, count_frankenstein]
novel_names = ["Pride and Prejudice", "Dracula", "Frankenstein"]
plt.figure(figsize=(7, 5))
ax = plt.bar(x=novel_names, height=line_counts, color="navy")
# label both axes of the chart
plt.ylabel("Number of Non-Empty Lines", fontsize=13)
plt.xlabel("Novel", fontsize=13)
plt.show()

# Worksheet 1: Handling Novel Full-Texts (SOLUTION)

# Task 1: Loading Text

# Task 2: Splitting Chapters

# Bonus Task: Comparing Full-Texts