SEARCH TEXTS  

Plays  +  Sonnets  +  Poems  +  Concordance  +  Advanced Search  +  About OSS

APPENDIX C: Parser source code

###########################################################################
# Shakespeare text parser
###########################################################################
# Eric M. Johnson
# July 12, 2003
#
# January 30, 2004: modified to use new database schema
#
# "Sections" = Acts
# "Chapters" = Scenes
###########################################################################

# begin timing the script
$begintime = time();

###########################################################################
# subroutine to add lines to database
###########################################################################

sub linewrite {
$writepara = $_[0];
$writeparanum = $_[1];
$writeparatype = $_[2];
$writeparasection = $_[3];
$writeparachapter = $_[4];

# identify the line type
if ($writeparatype eq '$') { $writeparatype = 's' } # stage directions
if ($writeparatype eq '%') { $writeparatype = 'b' } # blank verse -- parser can't tell difference between blank and metered verse
if ($writeparatype eq '^') { $writeparatype = 'b' } # blank verse -- parser can't tell difference between blank and metered verse

# remove leading ASCII characters for stage directions, character lines, continued lines
$writepara =~ s/[\$\%\^]//g;

# figure out who the character is, remove his name from the line
($charid, $writepara, $speechcount) = charfinger($writepara, $writeparatype);

# character count
$charcount = length($writepara);

# start by making everything lower case
$bareline = lc($writepara);

# strip out paragraph break string
$bareline =~ s/\[p\]//g;

# strip out newlines and replace with space
$bareline =~ s/\n/ /g;

# remove leading apostrophes
# insert a marker, then remove the marker and the apostrophe
$bareline =~ s/(\W')/\1APOSMARKER/g;
$bareline =~ s/'APOSMARKER//g;

# remove trailing apostrophes
# insert a marker, then remove the marker and the apostrophe
$bareline =~ s/('\W)/APOSMARKER\1/g;
$bareline =~ s/APOSMARKER'//g;

# replace emdashes with space
$bareline =~ s/\-\-/ /g;

# replace apostrophes with marker
$bareline =~ s/'/APOSMARKER/g;

# replace hyphens with marker
$bareline =~ s/\-/HYPHENMARKER/g;

# strip all non-alphanumeric characters
$bareline =~ s/[^a-zA-Z\s]//g;

# strip whitespace at the beginning of the line
$bareline =~ s/^\s+//;

# strip whitespace at the end of the line
$bareline =~ s/[ ]*\n//;

# strip multiple spaces
$bareline =~ s/\s+/ /g;

# split the line into words and count them
@words = split(/ |\n/, $bareline);
$wordcount = scalar(@words);
# add to the work's wordcount
$workwordcount = $workwordcount + $wordcount;

# get the stems and metaphone values of each word on the line
# first, clear the values, leaving a leading space for the stem and phonetic paragraph versions
$stemgraph = ' ';
$phonegraph = ' ';
$currentword = 0;

###########################################################################
# Begin processing word-by-word
###########################################################################
foreach $word (@words) {
# first, make sure we're not inserting a blank word
if ($word ne '') {
# increment the word count
$currentword++;

# remove apostrophe at beginning of word
$word =~ s/^APOSMARKER//g;

# remove hyphen at end of word
$word =~ s/HYPHENMARKER$//g;

# replace apostrophe and hyphen markers with real characters
$word =~ s/APOSMARKER/'/g;
$word =~ s/HYPHENMARKER/\-/g;

# add the word to the wordforms hash
$wordforms{$word}++;

# get stem and metaphone values
$bareword = $word;
$bareword =~ s/[^a-z]//g; # strip unacceptable characters

$stemword = Lingua::Stem::En::stem({-words => [$bareword]}) ;
$metaphoneword = Metaphone($bareword);

$stemgraph .= $stemword->[0] . " ";
$phonegraph .= $metaphoneword . " ";

# make sure all apostrophes will be acceptable for SQL
$word =~ s/[']/''/g;

}
}

# modify apostrophes to make it acceptable to SQL
$writepara =~ s/\'/\'\'/g;

# write a new line to the db
$sqlstatement = "INSERT INTO Paragraphs (WorkID, CharID, PlainText, StemText, PhoneticText, ParagraphNum, ParagraphType, Section, Chapter, CharCount, WordCount) " .
"VALUES ('$currentwork', '$charid', '$writepara', '$stemgraph', '$phonegraph', $writeparanum, '$writeparatype', $writeparasection, $writeparachapter, $charcount, $wordcount)";
if ($db->sql($sqlstatement)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die "\nDied while trying to write line $writeparanum\n$sqlstatement\n";
}
# increment the speech count and store it
$speechcount++;
$sqlstatement = "UPDATE Characters
SET SpeechCount=$speechcount
WHERE CharID = '$charid'";
#print "$sqlstatement\n\n";
if ($db->sql($sqlstatement)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die "\nDied while trying to update the speech count on line $writeparanum\n$sqlstatement\n";
}
$totalparagraphs++;
}

###########################################################################
# subroutine to figure out whose line it is, anyway
###########################################################################
sub charfinger {
$tempcharline = $_[0];
$tempcharparagraphtype = $_[1];

if ($tempcharparagraphtype ne 's') {
# get the chartemp value
$pdloc = index($tempcharline, ".");
$chartemp = substr($tempcharline, 0, $pdloc);
$tempcharline = substr($tempcharline, $pdloc + 2);

$charid = '';

if ($chartemp eq 'xxx') {
$charid = 'xxx';
}
else {
# get character info from db
$getcharinfo = "SELECT *
FROM Characters
WHERE Works
LIKE '%$currentwork%'
AND Abbrev='$chartemp'";
if ($db->sql($getcharinfo)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die;
}
else
{
if ($db->FetchRow()) {
my(%currentrow) = $db->DataHash();
$charid = $currentrow{CharID};
$charname = $currentrow{CharName};
$abbrev = $currentrow{Abbrev};
$speechcount = $currentrow{SpeechCount};
}
else
{
die "Character not found! Died at $writeparanum\nchartemp:$chartemp\ncurrentline=$currentline\nlinecounter=$.";
}
}
}
}
else
{
$charid = 'xxx' # this is for stage direction lines
}

# tell it who it is, otherwise return an error
if ($charid) {
#print "[$textlinecount]CharID: $charid\n";
}
else
{
print "[$textlinecount]Character not identified\n";
$noid++;
}
return $charid, $tempcharline, $speechcount;
}

###########################################################################
# subroutine to add new chapter
###########################################################################

sub addchapter {
$newsection = $_[0];
$newchapter = $_[1];
$description = $_[2];

# make apostrophes acceptable to SQL
$description =~ s/\'/\&\#8217\;/g;

# write new chapter to the db
$sqlstatement = "INSERT INTO Chapters(WorkID, Section, Chapter, Description) " .
"VALUES ('$currentwork', $newsection, $newchapter, '$description')";
#print "$sqlstatement\n\n";
if ($db->sql($sqlstatement)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die "\nDied at Section $newsection, Chapter $newchapter. Check to see if stage directions are on the same line as the chapter indicator.";
}
}

###########################################################################
# set up database connections
###########################################################################
use Win32::ODBC;
$db = new Win32::ODBC("oss");

###########################################################################
# open the language modules
###########################################################################
use Text::Metaphone;
use Lingua::Stem qw(stem);

###########################################################################
# delete all existing wordforms
###########################################################################
$sqlstatement = "DELETE From WordForms";
if ($db->sql($sqlstatement)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die "\nDied trying to delete all rows in the WordForm table";
}

###########################################################################
# variable population
###########################################################################

# populate all the Works if they are not specified on the command line
if (@ARGV) {
@worklist = @ARGV;
}
else
{
# get all works because no particular work was specified on the command line
$getworks = "SELECT WorkID
FROM Works
ORDER BY Title";
if ($db->sql($getworks)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die;
}
else
{
while ($db->FetchRow()) {
my(%currentrow) = $db->DataHash();
$worklist[$workcount] = $currentrow{WorkID};
$workcount++;
}
}
# remove the speech counts
$sqlstatement = "UPDATE Characters
SET SpeechCount=0";
#print "$sqlstatement\n\n";
if ($db->sql($sqlstatement)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die "\nDied while trying to erase the speech counts.\n";
}
}

# reset the workcount to zero
$totalworks = 0;

# start with Section 0, Chapter 1
$currentsection = 0;
$currentchapter = 0;

# flag for whether a line should be appended to a previous one
$appline = 0;

###########################################################################
# Main body of program
# Loop through each line, and parse according to what kind of line it is
###########################################################################

foreach $currentwork (@worklist) {

# reset counter variables
$noid = 0;
$totalparagraphs = 0;
$changelines = 0;
$charlinecount = 0;
$continuedlines = 0;
$textlinecount = 1;
$appline = 0;
$workwordcount = 0;

# get current work's title
$getworkinfo = "SELECT Title
FROM Works
WHERE WorkID='$currentwork'";
if ($db->sql($getworkinfo)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die "Could not get information about work $currentwork.";
}
else
{
while ($db->FetchRow()) {
my(%workinfo) = $db->DataHash();
$worktitle = $workinfo{'Title'};
}
}

# start timing for this work
$workbegintime = time();
# delete old rows in Paragraphs table
$sqlstatement = "DELETE * FROM Paragraphs WHERE WorkID='$currentwork'";
print "\n------------------------------------------------\n";
print uc($worktitle);
print "\n------------------------------------------------\n";

if ($db->sql($sqlstatement)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die
}
# delete old rows in Chapters for this play
$sqlstatement = "DELETE * FROM Chapters WHERE WorkID='$currentwork'";
if ($db->sql($sqlstatement)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die
}

$TEXTFILE = "\\oss\\texts\\parsing\\$currentwork.txt";
open TEXTFILE or die "Can't open file $TEXTFILE\n";

# line we're working on, if a character's line goes more than two lines
$pendingline = '';
$pendingparagraphnum = 0;

foreach $currentline () {
$addline = 1;

# get the first byte of the line, to determine what kind of line it is
$linekind = substr($currentline, 0, 1);

# stage direction lines
if ($linekind eq '$') {
$changelines++;

# is this a chapter or act change?
if (substr($currentline, 1, 7) eq "SECTION") {
$currentsection = substr($currentline, 9, 1);
# drop this line because it isn't needed
$addline = 0;
}
if (substr($currentline, 1, 7) eq "CHAPTER") {
# find where the period is, which is the indicator of where the scene number ends
$periodpos = index $currentline, ".", 7;

# figure out how many digits there are in the chapter
$numsize = $periodpos - 9;

$currentchapter = substr($currentline, 9, $numsize);

# extract setting info, chomp the paragraph break
$description = substr($currentline, 11+$numsize, length($currentline)-13);

# add the chapter to the db
addchapter($currentsection, $currentchapter, $description);

# drop this line because it isn't needed
$addline = 0;
}

if ($addline eq 1) {
# write current line to database unless this is a section or chapter indication line
if ($appline ne 0) {
linewrite($currentline, $textlinecount, $linekind, $currentsection, $currentchapter);
}
else
{
# write pending line to database
linewrite($pendingline, $pendingparagraphnum, $pendinglinekind, $pendingsection, $pendingchapter);

# clear pending line
$pendingline = '';
$pendingparagraphnum = 0;
$pendinglinekind = '';
$pendingsection = 0;
$pendingchapter = 0;

# write new line to database
linewrite($currentline, $textlinecount, $linekind, $currentsection, $currentchapter);
}
$appline = 0;
}
}

# Beginning of character lines
if ($linekind eq '%') {
$charlinecount++;

if ($appline ne 0) {
#write pending line to database
linewrite($pendingline, $pendingparagraphnum, $pendinglinekind, $pendingsection, $pendingchapter);

#clear old line
$pendingline = '';
$pendingparagraphnum = 0;
$pendinglinekind = '';
$pendingsection = 0;
$pendingchapter = 0;
}
# populate the pending line data with the current line
$pendingline = $currentline;
$pendingparagraphnum = $textlinecount;
$pendinglinekind = $linekind;
$pendingsection = $currentsection;
$pendingchapter = $currentchapter;
$appline = 1;
}

if ($linekind eq '^') {
$continuedlines++;
$pendingline = "$pendingline\[p\]$currentline";
}
# add the addline variable, which says whether we should increment the line count
$textlinecount = $textlinecount + $addline;
}

# write last pending line if it's still there
if ($pendingline) {
#write pending line to database
linewrite($pendingline, $pendingparagraphnum, $pendinglinekind, $pendingsection, $pendingchapter);
$textlinecount++;
}

# Show report data
print "Total lines processed: " . ($textlinecount + $changelines) . "\n";
print " Chapter/scene change lines: $changelines\n";
#print " Character lines paragraphs: $charlinecount\n";
#print " Continued paragraphs: $continuedlines\n";
$subtotal = $changelines + $charlinecount + $continuedlines;
#print "Subtotal: $subtotal\n";

# show total words, paragraphs
print "Total words: $workwordcount\n";
print "Total paragraphs: $totalparagraphs\n";

# update the database with total words and total paragraphs
$sqlstatement = "UPDATE Works
SET TotalWords=$workwordcount,
TotalParagraphs=$totalparagraphs
WHERE WorkID = '$currentwork'";
#print "$sqlstatement\n\n";
if ($db->sql($sqlstatement)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
die "\nDied while trying to update the word and paragraph totals on line $writeparanum\n$sqlstatement\n";
}

# close the file that was just parsed
close TEXTFILE;

# increment the works counter
$totalworks++;

# end timing for this work
$workendtime = time();
$workexectime = $workendtime - $workbegintime;
$minutes = int($workexectime / 60);
$seconds = sprintf("%02d", $workexectime - ($minutes * 60));
print "Execution time for this work $minutes:$seconds\n";

# show cumulative timing thus far
$cumulativetime = time() - $begintime;
$minutes = int($cumulativetime / 60);
$seconds = sprintf("%02d", $cumulativetime - ($minutes * 60));
print "Cumulative execution time $minutes:$seconds\n";
}

# show the word forms, add them to db
foreach $word (sort by_count keys %wordforms) {
#print "$word occurs $wordforms{$word} times\n";
# start by stripping unacceptable characters
$bareword = $word;
$bareword =~ s/[^a-z]//g;

# determine the stem and phonetic value of the word
$stemword = Lingua::Stem::En::stem({-words => [$bareword]}) ;
$metaphoneword = Metaphone($bareword);

# count occurences
$occurences = $wordforms{$word};

# make sure all apostrophes will be acceptable for SQL
$word =~ s/[']/''/g;
$stemword[0] =~ s/[']/''/g;

# create a new entry in the WordForms table
$addwordquery = "
INSERT INTO WordForms (PlainText, PhoneticText, StemText, Occurences)
VALUES ('$word', '$metaphoneword', '$stemword->[0]', $occurences)";
if ($db->sql($addwordquery)) {
my(@err) = $db->Error;
print "sql() ERROR\n";
print "@err\n";
print "currentword = $currentword\n$bareline\naddwordquery=$addwordquery";
die;
}
}

sub by_count {
$wordforms{$b} <=> $wordforms{$a};
}

###########################################################################
# Housecleaning
###########################################################################

# close the database connection
$db->Close();

# get the ending time and display execution time
$endtime = time();
$exectime = $endtime - $begintime;
$minutes = int($exectime / 60);
$seconds = $exectime - ($minutes * 60);
print "\n////////////////////////////////////////////////\n";
print "Works processed: $totalworks\n";

$minutes = int($exectime / 60);
$seconds = sprintf("%02d", $exectime - ($minutes * 60));
print "Total processing time $minutes:$seconds\n";

$avgtime = ($exectime / $totalworks);
$minutes = int($avgtime / 60);
$seconds = sprintf("%02d", $avgtime - ($minutes * 60));
print "Average time per work $minutes:$seconds\n"