Application: Making a Lexicon

(Section 13.5)

Packages

Make sure these are attached:

library(tidyverse)

A Lexicon for the Wizard of Oz

A Lexicon

A lexicon is any list of words—usually the words of a language or of some body of texts produced in that language.

Let’s make a lexicon for The Wizard of Oz.

Download and Read In

# Where the plain-text edition of the book is fetched from.
ozURL <- "http://www.gutenberg.org/cache/epub/55/pg55.txt"
# Download only once: skip the fetch when a local copy already exists.
if (!(file.exists("downloads/oz.txt"))) {
  # download.file() will not create a missing destination folder, so make
  # sure "downloads" exists before fetching (a no-op if it is already there).
  dir.create("downloads", showWarnings = FALSE, recursive = TRUE)
  download.file(
    url = ozURL,
    destfile = "downloads/oz.txt"
  )
}
# Read the local copy into a character vector, one element per line.
oz <- readLines(con = "downloads/oz.txt")

Take some time to look through oz.txt online.

Strip Spurious Material

# Helper: find which elements of a character vector match a pattern.
#   pattern - regular expression passed to str_detect()
#   text    - character vector to search
# Returns the positions (from which()) of the elements of `text` for which
# str_detect() reports a match.
findIndex <- function(pattern, text) {
  isMatch <- str_detect(text, pattern = pattern)
  which(isMatch)
}

# Find the lines to start and end at. The book sits between two Project
# Gutenberg marker lines; we keep everything strictly between them.
# Besides the original wording, the patterns also accept the alternative
# "START OF THE ..." / "*** END OF THE ..." wording (believed to be used in
# other Project Gutenberg files -- confirm against the downloaded copy).
# Taking element [1] uses the first matching line, and yields NA when there
# is no match at all.
firstLine <- findIndex(
  "^\\*\\*\\* ?START OF (THIS|THE) PROJECT GUTENBERG",
  oz
)[1] + 1
lastLine <- findIndex(
  paste0(
    "^(End of (the )?Project Gutenberg",
    "|\\*\\*\\* ?END OF (THIS|THE) PROJECT GUTENBERG)"
  ),
  oz
)[1] - 1

# Fail with a clear message instead of letting firstLine:lastLine break (or
# silently select the wrong lines) when a marker could not be located.
if (is.na(firstLine) || is.na(lastLine) || firstLine > lastLine) {
  stop(
    "Could not locate the Project Gutenberg start/end markers in oz.",
    call. = FALSE
  )
}

# trim oz to the desired text:
oz2 <- oz[firstLine:lastLine]

First Pass at the Words of Oz

# Split every line of the trimmed text at runs of whitespace, then flatten
# the per-line pieces into one character vector. This is only a first pass:
# blank lines come through as "" and punctuation stays attached to words.
ozwds <- unlist(str_split(oz2, pattern = "\\s+"))
# Peek at the first thousand entries.
ozwds[1:1000]
   [1] ""               ""               ""               ""              
   [5] ""               ""               ""               ""              
   [9] ""               ""               ""               ""              
  [13] ""               ""               ""               "The"           
  [17] "Wonderful"      "Wizard"         "of"             "Oz"            
  [21] ""               ""               "by"             ""              
  [25] "L."             "Frank"          "Baum"           ""              
  [29] ""               ""               ""               "Contents"      
  [33] ""               ""               "Introduction"   ""              
  [37] "1."             "The"            "Cyclone"        ""              
  [41] "2."             "The"            "Council"        "with"          
  [45] "the"            "Munchkins"      ""               "3."            
  [49] "How"            "Dorothy"        "Saved"          "the"           
  [53] "Scarecrow"      ""               "4."             "The"           
  [57] "Road"           "Through"        "the"            "Forest"        
  [61] ""               "5."             "The"            "Rescue"        
  [65] "of"             "the"            "Tin"            "Woodman"       
  [69] ""               "6."             "The"            "Cowardly"      
  [73] "Lion"           ""               "7."             "The"           
  [77] "Journey"        "to"             "the"            "Great"         
  [81] "Oz"             ""               "8."             "The"           
  [85] "Deadly"         "Poppy"          "Field"          ""              
  [89] "9."             "The"            "Queen"          "of"            
  [93] "the"            "Field"          "Mice"           ""              
  [97] "10."            "The"            "Guardian"       "of"            
 [101] "the"            "Gates"          ""               "11."           
 [105] "The"            "Emerald"        "City"           "of"            
 [109] "Oz"             ""               "12."            "The"           
 [113] "Search"         "for"            "the"            "Wicked"        
 [117] "Witch"          ""               "13."            "The"           
 [121] "Rescue"         ""               "14."            "The"           
 [125] "Winged"         "Monkeys"        ""               "15."           
 [129] "The"            "Discovery"      "of"             "Oz"            
 [133] "the"            "Terrible"       ""               "16."           
 [137] "The"            "Magic"          "Art"            "of"            
 [141] "the"            "Great"          "Humbug"         ""              
 [145] "17."            "How"            "the"            "Balloon"       
 [149] "Was"            "Launched"       ""               "18."           
 [153] "Away"           "to"             "the"            "South"         
 [157] ""               "19."            "Attacked"       "by"            
 [161] "the"            "Fighting"       "Trees"          ""              
 [165] "20."            "The"            "Dainty"         "China"         
 [169] "Country"        ""               "21."            "The"           
 [173] "Lion"           "Becomes"        "the"            "King"          
 [177] "of"             "Beasts"         ""               "22."           
 [181] "The"            "Country"        "of"             "the"           
 [185] "Quadlings"      ""               "23."            "Glinda"        
 [189] "The"            "Good"           "Witch"          "Grants"        
 [193] "Dorothy's"      "Wish"           ""               "24."           
 [197] "Home"           "Again"          ""               ""              
 [201] ""               ""               "Introduction"   ""              
 [205] ""               "Folklore,"      "legends,"       "myths"         
 [209] "and"            "fairy"          "tales"          "have"          
 [213] "followed"       "childhood"      "through"        "the"           
 [217] "ages,"          "for"            "every"          "healthy"       
 [221] "youngster"      "has"            "a"              "wholesome"     
 [225] "and"            "instinctive"    "love"           "for"           
 [229] "stories"        "fantastic,"     "marvelous"      "and"           
 [233] "manifestly"     "unreal."        "The"            "winged"        
 [237] "fairies"        "of"             "Grimm"          "and"           
 [241] "Andersen"       "have"           "brought"        "more"          
 [245] "happiness"      "to"             "childish"       "hearts"        
 [249] "than"           "all"            "other"          "human"         
 [253] "creations."     ""               "Yet"            "the"           
 [257] "old"            "time"           "fairy"          "tale,"         
 [261] "having"         "served"         "for"            "generations,"  
 [265] "may"            "now"            "be"             "classed"       
 [269] "as"             "\"historical\"" "in"             "the"           
 [273] "children's"     "library;"       "for"            "the"           
 [277] "time"           "has"            "come"           "for"           
 [281] "a"              "series"         "of"             "newer"         
 [285] "\"wonder"       "tales\""        "in"             "which"         
 [289] "the"            "stereotyped"    "genie,"         "dwarf"         
 [293] "and"            "fairy"          "are"            "eliminated,"   
 [297] "together"       "with"           "all"            "the"           
 [301] "horrible"       "and"            "blood-curdling" "incidents"     
 [305] "devised"        "by"             "their"          "authors"       
 [309] "to"             "point"          "a"              "fearsome"      
 [313] "moral"          "to"             "each"           "tale."         
 [317] "Modern"         "education"      "includes"       "morality;"     
 [321] "therefore"      "the"            "modern"         "child"         
 [325] "seeks"          "only"           "entertainment"  "in"            
 [329] "its"            "wonder"         "tales"          "and"           
 [333] "gladly"         "dispenses"      "with"           "all"           
 [337] "disagreeable"   "incident."      ""               "Having"        
 [341] "this"           "thought"        "in"             "mind,"         
 [345] "the"            "story"          "of"             "\"The"         
 [349] "Wonderful"      "Wizard"         "of"             "Oz\""          
 [353] "was"            "written"        "solely"         "to"            
 [357] "please"         "children"       "of"             "today."        
 [361] "It"             "aspires"        "to"             "being"         
 [365] "a"              "modernized"     "fairy"          "tale,"         
 [369] "in"             "which"          "the"            "wonderment"    
 [373] "and"            "joy"            "are"            "retained"      
 [377] "and"            "the"            "heartaches"     "and"           
 [381] "nightmares"     "are"            "left"           "out."          
 [385] ""               ""               "L."             "Frank"         
 [389] "Baum"           ""               "Chicago,"       "April,"        
 [393] "1900."          ""               ""               ""              
 [397] ""               "THE"            "WONDERFUL"      "WIZARD"        
 [401] "OF"             "OZ"             ""               ""              
 [405] ""               ""               "1."             "The"           
 [409] "Cyclone"        ""               ""               "Dorothy"       
 [413] "lived"          "in"             "the"            "midst"         
 [417] "of"             "the"            "great"          "Kansas"        
 [421] "prairies,"      "with"           "Uncle"          "Henry,"        
 [425] "who"            "was"            "a"              "farmer,"       
 [429] "and"            "Aunt"           "Em,"            "who"           
 [433] "was"            "the"            "farmer's"       "wife."         
 [437] "Their"          "house"          "was"            "small,"        
 [441] "for"            "the"            "lumber"         "to"            
 [445] "build"          "it"             "had"            "to"            
 [449] "be"             "carried"        "by"             "wagon"         
 [453] "many"           "miles."         "There"          "were"          
 [457] "four"           "walls,"         "a"              "floor"         
 [461] "and"            "a"              "roof,"          "which"         
 [465] "made"           "one"            "room;"          "and"           
 [469] "this"           "room"           "contained"      "a"             
 [473] "rusty"          "looking"        "cookstove,"     "a"             
 [477] "cupboard"       "for"            "the"            "dishes,"       
 [481] "a"              "table,"         "three"          "or"            
 [485] "four"           "chairs,"        "and"            "the"           
 [489] "beds."          "Uncle"          "Henry"          "and"           
 [493] "Aunt"           "Em"             "had"            "a"             
 [497] "big"            "bed"            "in"             "one"           
 [501] "corner,"        "and"            "Dorothy"        "a"             
 [505] "little"         "bed"            "in"             "another"       
 [509] "corner."        "There"          "was"            "no"            
 [513] "garret"         "at"             "all,"           "and"           
 [517] "no"             "cellar--except" "a"              "small"         
 [521] "hole"           "dug"            "in"             "the"           
 [525] "ground,"        "called"         "a"              "cyclone"       
 [529] "cellar,"        "where"          "the"            "family"        
 [533] "could"          "go"             "in"             "case"          
 [537] "one"            "of"             "those"          "great"         
 [541] "whirlwinds"     "arose,"         "mighty"         "enough"        
 [545] "to"             "crush"          "any"            "building"      
 [549] "in"             "its"            "path."          "It"            
 [553] "was"            "reached"        "by"             "a"             
 [557] "trap"           "door"           "in"             "the"           
 [561] "middle"         "of"             "the"            "floor,"        
 [565] "from"           "which"          "a"              "ladder"        
 [569] "led"            "down"           "into"           "the"           
 [573] "small,"         "dark"           "hole."          ""              
 [577] "When"           "Dorothy"        "stood"          "in"            
 [581] "the"            "doorway"        "and"            "looked"        
 [585] "around,"        "she"            "could"          "see"           
 [589] "nothing"        "but"            "the"            "great"         
 [593] "gray"           "prairie"        "on"             "every"         
 [597] "side."          "Not"            "a"              "tree"          
 [601] "nor"            "a"              "house"          "broke"         
 [605] "the"            "broad"          "sweep"          "of"            
 [609] "flat"           "country"        "that"           "reached"       
 [613] "to"             "the"            "edge"           "of"            
 [617] "the"            "sky"            "in"             "all"           
 [621] "directions."    "The"            "sun"            "had"           
 [625] "baked"          "the"            "plowed"         "land"          
 [629] "into"           "a"              "gray"           "mass,"         
 [633] "with"           "little"         "cracks"         "running"       
 [637] "through"        "it."            "Even"           "the"           
 [641] "grass"          "was"            "not"            "green,"        
 [645] "for"            "the"            "sun"            "had"           
 [649] "burned"         "the"            "tops"           "of"            
 [653] "the"            "long"           "blades"         "until"         
 [657] "they"           "were"           "the"            "same"          
 [661] "gray"           "color"          "to"             "be"            
 [665] "seen"           "everywhere."    "Once"           "the"           
 [669] "house"          "had"            "been"           "painted,"      
 [673] "but"            "the"            "sun"            "blistered"     
 [677] "the"            "paint"          "and"            "the"           
 [681] "rains"          "washed"         "it"             "away,"         
 [685] "and"            "now"            "the"            "house"         
 [689] "was"            "as"             "dull"           "and"           
 [693] "gray"           "as"             "everything"     "else."         
 [697] ""               "When"           "Aunt"           "Em"            
 [701] "came"           "there"          "to"             "live"          
 [705] "she"            "was"            "a"              "young,"        
 [709] "pretty"         "wife."          "The"            "sun"           
 [713] "and"            "wind"           "had"            "changed"       
 [717] "her,"           "too."           "They"           "had"           
 [721] "taken"          "the"            "sparkle"        "from"          
 [725] "her"            "eyes"           "and"            "left"          
 [729] "them"           "a"              "sober"          "gray;"         
 [733] "they"           "had"            "taken"          "the"           
 [737] "red"            "from"           "her"            "cheeks"        
 [741] "and"            "lips,"          "and"            "they"          
 [745] "were"           "gray"           "also."          "She"           
 [749] "was"            "thin"           "and"            "gaunt,"        
 [753] "and"            "never"          "smiled"         "now."          
 [757] "When"           "Dorothy,"       "who"            "was"           
 [761] "an"             "orphan,"        "first"          "came"          
 [765] "to"             "her,"           "Aunt"           "Em"            
 [769] "had"            "been"           "so"             "startled"      
 [773] "by"             "the"            "child's"        "laughter"      
 [777] "that"           "she"            "would"          "scream"        
 [781] "and"            "press"          "her"            "hand"          
 [785] "upon"           "her"            "heart"          "whenever"      
 [789] "Dorothy's"      "merry"          "voice"          "reached"       
 [793] "her"            "ears;"          "and"            "she"           
 [797] "still"          "looked"         "at"             "the"           
 [801] "little"         "girl"           "with"           "wonder"        
 [805] "that"           "she"            "could"          "find"          
 [809] "anything"       "to"             "laugh"          "at."           
 [813] ""               "Uncle"          "Henry"          "never"         
 [817] "laughed."       "He"             "worked"         "hard"          
 [821] "from"           "morning"        "till"           "night"         
 [825] "and"            "did"            "not"            "know"          
 [829] "what"           "joy"            "was."           "He"            
 [833] "was"            "gray"           "also,"          "from"          
 [837] "his"            "long"           "beard"          "to"            
 [841] "his"            "rough"          "boots,"         "and"           
 [845] "he"             "looked"         "stern"          "and"           
 [849] "solemn,"        "and"            "rarely"         "spoke."        
 [853] ""               "It"             "was"            "Toto"          
 [857] "that"           "made"           "Dorothy"        "laugh,"        
 [861] "and"            "saved"          "her"            "from"          
 [865] "growing"        "as"             "gray"           "as"            
 [869] "her"            "other"          "surroundings."  "Toto"          
 [873] "was"            "not"            "gray;"          "he"            
 [877] "was"            "a"              "little"         "black"         
 [881] "dog,"           "with"           "long"           "silky"         
 [885] "hair"           "and"            "small"          "black"         
 [889] "eyes"           "that"           "twinkled"       "merrily"       
 [893] "on"             "either"         "side"           "of"            
 [897] "his"            "funny,"         "wee"            "nose."         
 [901] "Toto"           "played"         "all"            "day"           
 [905] "long,"          "and"            "Dorothy"        "played"        
 [909] "with"           "him,"           "and"            "loved"         
 [913] "him"            "dearly."        ""               "Today,"        
 [917] "however,"       "they"           "were"           "not"           
 [921] "playing."       "Uncle"          "Henry"          "sat"           
 [925] "upon"           "the"            "doorstep"       "and"           
 [929] "looked"         "anxiously"      "at"             "the"           
 [933] "sky,"           "which"          "was"            "even"          
 [937] "grayer"         "than"           "usual."         "Dorothy"       
 [941] "stood"          "in"             "the"            "door"          
 [945] "with"           "Toto"           "in"             "her"           
 [949] "arms,"          "and"            "looked"         "at"            
 [953] "the"            "sky"            "too."           "Aunt"          
 [957] "Em"             "was"            "washing"        "the"           
 [961] "dishes."        ""               "From"           "the"           
 [965] "far"            "north"          "they"           "heard"         
 [969] "a"              "low"            "wail"           "of"            
 [973] "the"            "wind,"          "and"            "Uncle"         
 [977] "Henry"          "and"            "Dorothy"        "could"         
 [981] "see"            "where"          "the"            "long"          
 [985] "grass"          "bowed"          "in"             "waves"         
 [989] "before"         "the"            "coming"         "storm."        
 [993] "There"          "now"            "came"           "a"             
 [997] "sharp"          "whistling"      "in"             "the"           

Refine

Convert every word to lowercase, and keep only one copy of each word:

# Fold the raw words to lowercase, keep a single copy of each distinct
# string, and put the result in sorted order.
ozWords <- str_sort(unique(str_to_lower(ozwds)))
# Peek at the first thousand entries.
ozWords[1:1000]
   [1] ""                     "'and"                 "'he"                 
   [4] "'how"                 "'if"                  "\"'blue"             
   [7] "\"'i"                 "\"'never"             "\"'now"              
  [10] "\"'swim"              "\"'that's"            "\"'they"             
  [13] "\"'this"              "\"'why,"              "\"a"                 
  [16] "\"after"              "\"ah,\""              "\"all"               
  [19] "\"although"           "\"am"                 "\"and"               
  [22] "\"anyone"             "\"are"                "\"aren't"            
  [25] "\"at"                 "\"aunt"               "\"be"                
  [28] "\"because"            "\"because,\""         "\"before"            
  [31] "\"bless"              "\"blue"               "\"bring"             
  [34] "\"built"              "\"but"                "\"but,"              
  [37] "\"but,\""             "\"by"                 "\"call"              
  [40] "\"can"                "\"can't"              "\"carry"             
  [43] "\"certainly,\""       "\"certainly;"         "\"certainly."        
  [46] "\"city"               "\"come"               "\"come,"             
  [49] "\"congratulate"       "\"dear"               "\"did"               
  [52] "\"didn't"             "\"do"                 "\"does"              
  [55] "\"doesn't"            "\"don't"              "\"don't!"            
  [58] "\"drink.\""           "\"ep-pe,"             "\"even"              
  [61] "\"everything"         "\"exactly"            "\"fly"               
  [64] "\"follow"             "\"for"                "\"for,"              
  [67] "\"for,\""             "\"from"               "\"full"              
  [70] "\"get"                "\"give"               "\"glinda"            
  [73] "\"go"                 "\"good"               "\"good-bye,"         
  [76] "\"good-bye,\""        "\"good-bye!\""        "\"have"              
  [79] "\"he"                 "\"he's"               "\"hello!\""          
  [82] "\"help"               "\"here"               "\"hereafter"         
  [85] "\"hil-lo,"            "\"historical\""       "\"how"               
  [88] "\"how,"               "\"how?\""             "\"hurry"             
  [91] "\"hush,"              "\"i"                  "\"i'll"              
  [94] "\"i'm"                "\"i've"               "\"if"                
  [97] "\"in"                 "\"indeed,"            "\"is"                
 [100] "\"isn't"              "\"it"                 "\"it's"              
 [103] "\"just"               "\"keep"               "\"kill"              
 [106] "\"killed"             "\"let"                "\"lies"              
 [109] "\"look!\""            "\"make"               "\"making"            
 [112] "\"melted!"            "\"my"                 "\"neither."          
 [115] "\"never"              "\"never."             "\"no"                
 [118] "\"no,"                "\"no,\""              "\"no;"               
 [121] "\"no."                "\"nor"                "\"not"               
 [124] "\"nothing"            "\"now"                "\"now,"              
 [127] "\"now,\""             "\"of"                 "\"oh,"               
 [130] "\"oh,\""              "\"oh!"                "\"oil"               
 [133] "\"once,\""            "\"one"                "\"one,"              
 [136] "\"only"               "\"or"                 "\"over"              
 [139] "\"oz"                 "\"perhaps"            "\"perhaps,\""        
 [142] "\"permit"             "\"please"             "\"quelala"           
 [145] "\"quick,"             "\"quick!\""           "\"really,\""         
 [148] "\"really?\""          "\"rest"               "\"run"               
 [151] "\"see"                "\"send"               "\"shall"             
 [154] "\"she"                "\"since"              "\"sit"               
 [157] "\"so"                 "\"so,"                "\"something"         
 [160] "\"step"               "\"suppose"            "\"take"              
 [163] "\"tell"               "\"thank"              "\"that"              
 [166] "\"that's"             "\"the"                "\"then"              
 [169] "\"then,"              "\"there"              "\"there,\""          
 [172] "\"there!\""           "\"there's"            "\"therefore"         
 [175] "\"they"               "\"this"               "\"to"                
 [178] "\"tol-de-ri-de-oh!\"" "\"true,\""            "\"truly"             
 [181] "\"very"               "\"wait"               "\"we"                
 [184] "\"we!\""              "\"we're"              "\"welcome,"          
 [187] "\"well,"              "\"well,\""            "\"what"              
 [190] "\"what!"              "\"when"               "\"where"             
 [193] "\"which"              "\"who"                "\"why"               
 [196] "\"why,"               "\"why?\""             "\"will"              
 [199] "\"willingly!\""       "\"wishes"             "\"with"              
 [202] "\"won't"              "\"wonder"             "\"yes,"              
 [205] "\"yes,\""             "\"yonder,"            "\"you"               
 [208] "\"you're"             "\"you've"             "\"your"              
 [211] "\"ziz-zy,"            "(which"               "1."                  
 [214] "10."                  "11."                  "12."                 
 [217] "13."                  "14."                  "15."                 
 [220] "16."                  "17."                  "18."                 
 [223] "19."                  "1900."                "2."                  
 [226] "20."                  "21."                  "22."                 
 [229] "23."                  "24."                  "3."                  
 [232] "4."                   "5."                   "6."                  
 [235] "7."                   "8."                   "9."                  
 [238] "a"                    "a--a--a"              "able"                
 [241] "able,\""              "able."                "abounding"           
 [244] "about"                "about,"               "about;"              
 [247] "about.\""             "above"                "abundance."          
 [250] "accident"             "accidents"            "account"             
 [253] "accounts"             "ached."               "across"              
 [256] "across,"              "act"                  "act."                
 [259] "action"               "actually"             "add"                 
 [262] "added"                "added,"               "addition"            
 [265] "admit"                "admit.\""             "admitted"            
 [268] "advanced"             "advanced,"            "adventure"           
 [271] "adventures"           "advice.\""            "advise"              
 [274] "afford"               "afraid"               "afraid,"             
 [277] "afraid,\""            "afraid."              "afraid.\""           
 [280] "afresh"               "after"                "after,"              
 [283] "after."               "afternoon"            "afternoon,"          
 [286] "afterward"            "afterwards"           "again"               
 [289] "again--or"            "again,"               "again,\""            
 [292] "again;"               "again!\""             "again?\""            
 [295] "again."               "again.\""             "against"             
 [298] "age,"                 "ages,"                "ago"                 
 [301] "ago,"                 "ago.\""               "agree"               
 [304] "agreed"               "ahead,"               "air"                 
 [307] "air,"                 "air;"                 "air."                
 [310] "airtight,"            "alarm"                "alas!"               
 [313] "alive"                "alive,"               "alive."              
 [316] "alive.\""             "all"                  "all,"                
 [319] "all,\""               "all;"                 "all!\""              
 [322] "all."                 "all.\""               "allow"               
 [325] "allowed"              "almost"               "alone"               
 [328] "alone,"               "alone."               "alone.\""            
 [331] "along"                "along,"               "along,\""            
 [334] "along."               "already"              "already,\""          
 [337] "already.\""           "also"                 "also,"               
 [340] "also,\""              "also;"                "also?\""             
 [343] "also."                "although"             "altogether,"         
 [346] "always"               "am"                   "am--i"               
 [349] "am,\""                "am.\""                "amazement"           
 [352] "amazement,"           "among"                "amongst"             
 [355] "amuse"                "amused"               "an"                  
 [358] "and"                  "and,"                 "andersen"            
 [361] "angered"              "angrier"              "angrily."            
 [364] "angry"                "angry,"               "angry."              
 [367] "animal"               "animal,"              "animals"             
 [370] "animals."             "announced"            "another"             
 [373] "another,"             "another:"             "answer"              
 [376] "answer,"              "answer;"              "answer."             
 [379] "answered"             "answered,"            "answered:"           
 [382] "answered."            "ant"                  "anxious"             
 [385] "anxious,\""           "anxiously"            "anxiously,"          
 [388] "anxiously."           "any"                  "any?\""              
 [391] "anybody"              "anybody,"             "anyone"              
 [394] "anyone,"              "anyone."              "anything"            
 [397] "anything,"            "anything;"            "anything?\""         
 [400] "anything."            "anything.\""          "anyway,\""           
 [403] "anywhere"             "anywhere,\""          "appeared"            
 [406] "appears"              "approach"             "approached,"         
 [409] "approve"              "april,"               "apron"               
 [412] "apron,"               "apron."               "arched"              
 [415] "are"                  "are,"                 "are,\""              
 [418] "are.\""               "aren't"               "arm"                 
 [421] "armed"                "armless"              "arms"                
 [424] "arms,"                "arms,\""              "arms."               
 [427] "army"                 "arose"                "arose,"              
 [430] "around"               "around,"              "around."             
 [433] "aroused"              "arrive"               "art"                 
 [436] "arts"                 "as"                   "ashamed"             
 [439] "ask"                  "ask."                 "asked"               
 [442] "asked,"               "asked:"               "asked."              
 [445] "asleep"               "asleep,"              "asleep."             
 [448] "aspires"              "assemblage"           "assistance."         
 [451] "assorted"             "astonished"           "astonished;"         
 [454] "at"                   "at."                  "ate"                 
 [457] "attack"               "attacked"             "attempt.\""          
 [460] "attended"             "audience,"            "aunt"                
 [463] "aunt."                "authors"              "awake"               
 [466] "awake,"               "awaken;"              "awakened"            
 [469] "awakened,"            "away"                 "away,"               
 [472] "away."                "away.\""              "awful"               
 [475] "awfully"              "awkward"              "awoke"               
 [478] "axe"                  "axe-handle"           "axe,"                
 [481] "babies"               "baby"                 "baby,"               
 [484] "back"                 "back,"                "back!"               
 [487] "back!\""              "back."                "backs"               
 [490] "backward,"            "bad"                  "bad,\""              
 [493] "bad!\""               "bade"                 "badly"               
 [496] "badly,"               "badly.\""             "bag"                 
 [499] "bag."                 "baked"                "balanced"            
 [502] "bald"                 "ball"                 "ball."               
 [505] "balloon"              "balloon,\""           "balloon."            
 [508] "balloon.\""           "balloonist.\""        "balls"               
 [511] "band"                 "band."                "bands"               
 [514] "bank"                 "bank,"                "banks"               
 [517] "banks,"               "bar"                  "bar,"                
 [520] "bark"                 "bark,"                "barked"              
 [523] "barked."              "barking"              "barn,"               
 [526] "barns,"               "barnyard,"            "bars"                
 [529] "basin."               "basket"               "basket,"             
 [532] "basket."              "basket.\""            "baskets,"            
 [535] "bath."                "bathed"               "bathing."            
 [538] "battered"             "battle"               "battle,"             
 [541] "baum"                 "be"                   "be,"                 
 [544] "be,\""                "be."                  "be.\""               
 [547] "beam"                 "bear"                 "beard"               
 [550] "beard."               "beards."              "bearing"             
 [553] "bears"                "beast"                "beast,"              
 [556] "beast,\""             "beast."               "beast.\""            
 [559] "beast's"              "beasts"               "beasts,"             
 [562] "beasts!"              "beasts."              "beasts.\""           
 [565] "beat"                 "beat.\""              "beaten"              
 [568] "beating"              "beautiful"            "beautiful,"          
 [571] "beautiful,\""         "beautiful!"           "beautiful?\""        
 [574] "beautiful."           "beautifully"          "beauty"              
 [577] "beauty?\""            "beauty."              "became"              
 [580] "became."              "because"              "beckon"              
 [583] "become"               "becomes"              "bed"                 
 [586] "bed,"                 "bed."                 "bed.\""              
 [589] "beds."                "been"                 "bees"                
 [592] "bees,"                "beetle"               "before"              
 [595] "before,"              "before;"              "before?\""           
 [598] "before."              "before.\""            "beg"                 
 [601] "began"                "begged"               "begin"               
 [604] "beginning"            "begins"               "begun"               
 [607] "behind"               "behind,"              "behind."             
 [610] "behold!"              "being"                "believe"             
 [613] "believe!\""           "believe.\""           "bell"                
 [616] "bell."                "bells"                "belong"              
 [619] "belonged"             "belongs"              "below"               
 [622] "below."               "bend"                 "bending"             
 [625] "bent"                 "bent."                "beside"              
 [628] "besides"              "besides,"             "best"                
 [631] "best,"                "bestow"               "better"              
 [634] "better."              "better.\""            "between"             
 [637] "between,"             "bewilderment."        "beyond"              
 [640] "bidding"              "bidding."             "big"                 
 [643] "big,"                 "big;"                 "bigger"              
 [646] "bigger,'\""           "biggest"              "bird"                
 [649] "bird;"                "birds"                "birds,"              
 [652] "bit"                  "bit,\""               "bit."                
 [655] "bite"                 "bites.\""             "biting"              
 [658] "bitten"               "bitten,"              "bitterly"            
 [661] "black"                "blade"                "blades"              
 [664] "bleed"                "blew"                 "blind"               
 [667] "blinded"              "blistered"            "block"               
 [670] "blocks"               "blood"                "blood-curdling"      
 [673] "blossoms,"            "blow"                 "blue"                
 [676] "blue,"                "blue;"                "blunted"             
 [679] "boards"               "bodices"              "bodies"              
 [682] "body"                 "body,"                "body."               
 [685] "bogs"                 "boisterous"           "boldly"              
 [688] "bondage"              "bondage."             "bondage.\""          
 [691] "books"                "books."               "boots"               
 [694] "boots,"               "boq"                  "boq."                
 [697] "bordered"             "born"                 "both"                
 [700] "both,"                "bother"               "bottle,"             
 [703] "bottom"               "bottom."              "bought"              
 [706] "bounded"              "bounding"             "bow"                 
 [709] "bow,"                 "bow."                 "bowed"               
 [712] "bowed,"               "bowing"               "bowl"                
 [715] "box"                  "box,"                 "box."                
 [718] "boy"                  "boy,"                 "bracelet"            
 [721] "braid;"               "brain"                "brains"              
 [724] "brains,"              "brains,\""            "brains!\""           
 [727] "brains?\""            "brains."              "brains.\""           
 [730] "brains\""             "bran-new"             "bran,"               
 [733] "branch"               "branches"             "branches,"           
 [736] "branches."            "brave"                "brave,"              
 [739] "bravely"              "bravely."             "braver,"             
 [742] "braver,\""            "bread"                "bread,"              
 [745] "bread."               "break"                "breakfast,"          
 [748] "breakfast."           "breakfast.\""         "breakfasted"         
 [751] "breaking"             "breast"               "breast,"             
 [754] "breast;"              "breast."              "breath"              
 [757] "breathe"              "breathed"             "breathes"            
 [760] "breathing"            "breeches"             "breeze"              
 [763] "brick"                "brick,"               "brick,\""            
 [766] "brick."               "bricks,"              "bricks."             
 [769] "bride"                "bridge"               "bridges"             
 [772] "bright"               "brighter"             "brighter,"           
 [775] "brightest"            "brightly"             "brightly."           
 [778] "brightness"           "brilliancy"           "brilliancy."         
 [781] "brilliant"            "brims"                "bring"               
 [784] "bring."               "bringing"             "brings"              
 [787] "brisk"                "briskly"              "brittle!\""          
 [790] "broad"                "brocaded"             "broke"               
 [793] "broken"               "broken?\""            "brook"               
 [796] "brook,"               "brooks"               "brother"             
 [799] "brought"              "brought."             "brown"               
 [802] "brown,"               "brownie,"             "bruised"             
 [805] "brutes"               "bucket"               "buckles"             
 [808] "bug"                  "build"                "building"            
 [811] "building,"            "built"                "built,"              
 [814] "bulged"               "bundle"               "burn"                
 [817] "burned"               "burning"              "burnished"           
 [820] "burst."               "bushes."              "business"            
 [823] "busy"                 "busy,"                "but"                 
 [826] "but,"                 "butter."              "buttercups"          
 [829] "buttercups."          "butterflies,"         "butterfly"           
 [832] "button"               "buzzing"              "by"                  
 [835] "by,"                  "by;"                  "by."                 
 [838] "cabbages"             "cabinets"             "cackling"            
 [841] "cake"                 "cakes,"               "call"                
 [844] "call,"                "called"               "called,"             
 [847] "calling"              "calmly"               "calmly."             
 [850] "came"                 "came,"                "came."               
 [853] "camp"                 "can"                  "can,\""              
 [856] "can."                 "can't"                "candy"               
 [859] "cannon"               "cannot"               "cannot!\""           
 [862] "cannot?\""            "cap"                  "cap,"                
 [865] "cap,\""               "cap;"                 "cap!\""              
 [868] "cap?\""               "cap."                 "cap.\""              
 [871] "caps."                "care"                 "care;"               
 [874] "cared"                "careful"              "careful!\""          
 [877] "careful."             "carefully"            "carefully,"          
 [880] "carefully."           "careless"             "carpet"              
 [883] "carpeted"             "carried"              "carried."            
 [886] "carried.\""           "carries"              "carry"               
 [889] "carrying"             "carts,"               "carved"              
 [892] "carved."              "case"                 "cast"                
 [895] "castle"               "castle,"              "castle?\""           
 [898] "castle."              "cat"                  "cat."                
 [901] "catch"                "caught"               "caused"              
 [904] "ceiling"              "ceiling."             "celebrate"           
 [907] "cellar--except"       "cellar,"              "cellar!\""           
 [910] "center"               "certain."             "certainly"           
 [913] "chain"                "chair"                "chair,"              
 [916] "chairs,"              "chairs;"              "chalk"               
 [919] "chamber"              "chance"               "change"              
 [922] "changed"              "chariot"              "charm"               
 [925] "charm,"               "charm,\""             "charm."              
 [928] "chase"                "chased"               "chasing"             
 [931] "chatted"              "chattering"           "checked,\""          
 [934] "checks"               "cheeks"               "cheer"               
 [937] "cheer,"               "cheerful"             "cheerfully,"         
 [940] "cheerfully."          "cheers"               "chest"               
 [943] "chicago,"             "chickens,"            "child"               
 [946] "child,"               "child!\""             "child?\""            
 [949] "child."               "child's"              "childhood"           
 [952] "childish"             "children"             "children--walking"   
 [955] "children's"           "chin"                 "china"               
 [958] "china,"               "china."               "chop"                
 [961] "chop,"                "chopped"              "chopping"            
 [964] "chorus"               "chorus."              "chose"               
 [967] "chose,"               "chubby"               "church"              
 [970] "church."              "circle"               "circus"              
 [973] "circus,\""            "city"                 "city,"               
 [976] "city,\""              "city;"                "city?\""             
 [979] "city."                "city.\""              "civilized"           
 [982] "civilized,"           "claim"                "clapped"             
 [985] "clapping"             "clasping"             "classed"             
 [988] "clatter."             "claws"                "claws,"              
 [991] "claws,\""             "claws."               "clean"               
 [994] "cleaned"              "clear"                "clear,"              
 [997] "cleared"              "climb"                "climb,\""            
[1000] "climbed"             

Issues with ozWords

  • There are some numbers. We’ll get rid of them.
  • A great many strings begin or end with punctuation. It should not be difficult to strip this off with str_replace().
  • There are plenty of contractions, like "aren't". It seems reasonable to count these as words, so we’ll leave them alone.
  • There are hyphenated words. What should we do with them?

Some Hyphenated Words in Oz

So the Wicked Witch took the Golden Cap from her cupboard and placed it upon her head. Then she stood upon her left foot and said slowly:

“Ep-pe, pep-pe, kak-ke!”

Next she stood upon her right foot and said:

“Hil-lo, hol-lo, hel-lo!”

After this she stood upon both feet and cried in a loud voice:

“Ziz-zy, zuz-zy, zik!”

Pairs of Hyphens

On the other hand, we often find pairs of hyphens that appear to act as the em dash character (—):

The cyclone had set the house down very gently--for a cyclone--in the midst of a country of marvelous beauty.

To prevent strings such as "cellar--except" from coming in as single words, we need to split on double hyphens as well as on whitespace.

A Second Attempt

# Break every line of the trimmed text into tokens, cutting wherever
# there is whitespace or a run of two or more hyphens, then flatten
# the per-line pieces into one character vector.
ozwds <- unlist(
  str_split(
    oz2,
    pattern = "(?x)    # allow comments
              (-{2,})  # two or more hyphens
              |        # or
              (\\s+)   # whitespace
              "
  )
)
# preview the first thousand tokens
ozwds[seq_len(1000)]
   [1] ""               ""               ""               ""              
   [5] ""               ""               ""               ""              
   [9] ""               ""               ""               ""              
  [13] ""               ""               ""               "The"           
  [17] "Wonderful"      "Wizard"         "of"             "Oz"            
  [21] ""               ""               "by"             ""              
  [25] "L."             "Frank"          "Baum"           ""              
  [29] ""               ""               ""               "Contents"      
  [33] ""               ""               "Introduction"   ""              
  [37] "1."             "The"            "Cyclone"        ""              
  [41] "2."             "The"            "Council"        "with"          
  [45] "the"            "Munchkins"      ""               "3."            
  [49] "How"            "Dorothy"        "Saved"          "the"           
  [53] "Scarecrow"      ""               "4."             "The"           
  [57] "Road"           "Through"        "the"            "Forest"        
  [61] ""               "5."             "The"            "Rescue"        
  [65] "of"             "the"            "Tin"            "Woodman"       
  [69] ""               "6."             "The"            "Cowardly"      
  [73] "Lion"           ""               "7."             "The"           
  [77] "Journey"        "to"             "the"            "Great"         
  [81] "Oz"             ""               "8."             "The"           
  [85] "Deadly"         "Poppy"          "Field"          ""              
  [89] "9."             "The"            "Queen"          "of"            
  [93] "the"            "Field"          "Mice"           ""              
  [97] "10."            "The"            "Guardian"       "of"            
 [101] "the"            "Gates"          ""               "11."           
 [105] "The"            "Emerald"        "City"           "of"            
 [109] "Oz"             ""               "12."            "The"           
 [113] "Search"         "for"            "the"            "Wicked"        
 [117] "Witch"          ""               "13."            "The"           
 [121] "Rescue"         ""               "14."            "The"           
 [125] "Winged"         "Monkeys"        ""               "15."           
 [129] "The"            "Discovery"      "of"             "Oz"            
 [133] "the"            "Terrible"       ""               "16."           
 [137] "The"            "Magic"          "Art"            "of"            
 [141] "the"            "Great"          "Humbug"         ""              
 [145] "17."            "How"            "the"            "Balloon"       
 [149] "Was"            "Launched"       ""               "18."           
 [153] "Away"           "to"             "the"            "South"         
 [157] ""               "19."            "Attacked"       "by"            
 [161] "the"            "Fighting"       "Trees"          ""              
 [165] "20."            "The"            "Dainty"         "China"         
 [169] "Country"        ""               "21."            "The"           
 [173] "Lion"           "Becomes"        "the"            "King"          
 [177] "of"             "Beasts"         ""               "22."           
 [181] "The"            "Country"        "of"             "the"           
 [185] "Quadlings"      ""               "23."            "Glinda"        
 [189] "The"            "Good"           "Witch"          "Grants"        
 [193] "Dorothy's"      "Wish"           ""               "24."           
 [197] "Home"           "Again"          ""               ""              
 [201] ""               ""               "Introduction"   ""              
 [205] ""               "Folklore,"      "legends,"       "myths"         
 [209] "and"            "fairy"          "tales"          "have"          
 [213] "followed"       "childhood"      "through"        "the"           
 [217] "ages,"          "for"            "every"          "healthy"       
 [221] "youngster"      "has"            "a"              "wholesome"     
 [225] "and"            "instinctive"    "love"           "for"           
 [229] "stories"        "fantastic,"     "marvelous"      "and"           
 [233] "manifestly"     "unreal."        "The"            "winged"        
 [237] "fairies"        "of"             "Grimm"          "and"           
 [241] "Andersen"       "have"           "brought"        "more"          
 [245] "happiness"      "to"             "childish"       "hearts"        
 [249] "than"           "all"            "other"          "human"         
 [253] "creations."     ""               "Yet"            "the"           
 [257] "old"            "time"           "fairy"          "tale,"         
 [261] "having"         "served"         "for"            "generations,"  
 [265] "may"            "now"            "be"             "classed"       
 [269] "as"             "\"historical\"" "in"             "the"           
 [273] "children's"     "library;"       "for"            "the"           
 [277] "time"           "has"            "come"           "for"           
 [281] "a"              "series"         "of"             "newer"         
 [285] "\"wonder"       "tales\""        "in"             "which"         
 [289] "the"            "stereotyped"    "genie,"         "dwarf"         
 [293] "and"            "fairy"          "are"            "eliminated,"   
 [297] "together"       "with"           "all"            "the"           
 [301] "horrible"       "and"            "blood-curdling" "incidents"     
 [305] "devised"        "by"             "their"          "authors"       
 [309] "to"             "point"          "a"              "fearsome"      
 [313] "moral"          "to"             "each"           "tale."         
 [317] "Modern"         "education"      "includes"       "morality;"     
 [321] "therefore"      "the"            "modern"         "child"         
 [325] "seeks"          "only"           "entertainment"  "in"            
 [329] "its"            "wonder"         "tales"          "and"           
 [333] "gladly"         "dispenses"      "with"           "all"           
 [337] "disagreeable"   "incident."      ""               "Having"        
 [341] "this"           "thought"        "in"             "mind,"         
 [345] "the"            "story"          "of"             "\"The"         
 [349] "Wonderful"      "Wizard"         "of"             "Oz\""          
 [353] "was"            "written"        "solely"         "to"            
 [357] "please"         "children"       "of"             "today."        
 [361] "It"             "aspires"        "to"             "being"         
 [365] "a"              "modernized"     "fairy"          "tale,"         
 [369] "in"             "which"          "the"            "wonderment"    
 [373] "and"            "joy"            "are"            "retained"      
 [377] "and"            "the"            "heartaches"     "and"           
 [381] "nightmares"     "are"            "left"           "out."          
 [385] ""               ""               "L."             "Frank"         
 [389] "Baum"           ""               "Chicago,"       "April,"        
 [393] "1900."          ""               ""               ""              
 [397] ""               "THE"            "WONDERFUL"      "WIZARD"        
 [401] "OF"             "OZ"             ""               ""              
 [405] ""               ""               "1."             "The"           
 [409] "Cyclone"        ""               ""               "Dorothy"       
 [413] "lived"          "in"             "the"            "midst"         
 [417] "of"             "the"            "great"          "Kansas"        
 [421] "prairies,"      "with"           "Uncle"          "Henry,"        
 [425] "who"            "was"            "a"              "farmer,"       
 [429] "and"            "Aunt"           "Em,"            "who"           
 [433] "was"            "the"            "farmer's"       "wife."         
 [437] "Their"          "house"          "was"            "small,"        
 [441] "for"            "the"            "lumber"         "to"            
 [445] "build"          "it"             "had"            "to"            
 [449] "be"             "carried"        "by"             "wagon"         
 [453] "many"           "miles."         "There"          "were"          
 [457] "four"           "walls,"         "a"              "floor"         
 [461] "and"            "a"              "roof,"          "which"         
 [465] "made"           "one"            "room;"          "and"           
 [469] "this"           "room"           "contained"      "a"             
 [473] "rusty"          "looking"        "cookstove,"     "a"             
 [477] "cupboard"       "for"            "the"            "dishes,"       
 [481] "a"              "table,"         "three"          "or"            
 [485] "four"           "chairs,"        "and"            "the"           
 [489] "beds."          "Uncle"          "Henry"          "and"           
 [493] "Aunt"           "Em"             "had"            "a"             
 [497] "big"            "bed"            "in"             "one"           
 [501] "corner,"        "and"            "Dorothy"        "a"             
 [505] "little"         "bed"            "in"             "another"       
 [509] "corner."        "There"          "was"            "no"            
 [513] "garret"         "at"             "all,"           "and"           
 [517] "no"             "cellar"         "except"         "a"             
 [521] "small"          "hole"           "dug"            "in"            
 [525] "the"            "ground,"        "called"         "a"             
 [529] "cyclone"        "cellar,"        "where"          "the"           
 [533] "family"         "could"          "go"             "in"            
 [537] "case"           "one"            "of"             "those"         
 [541] "great"          "whirlwinds"     "arose,"         "mighty"        
 [545] "enough"         "to"             "crush"          "any"           
 [549] "building"       "in"             "its"            "path."         
 [553] "It"             "was"            "reached"        "by"            
 [557] "a"              "trap"           "door"           "in"            
 [561] "the"            "middle"         "of"             "the"           
 [565] "floor,"         "from"           "which"          "a"             
 [569] "ladder"         "led"            "down"           "into"          
 [573] "the"            "small,"         "dark"           "hole."         
 [577] ""               "When"           "Dorothy"        "stood"         
 [581] "in"             "the"            "doorway"        "and"           
 [585] "looked"         "around,"        "she"            "could"         
 [589] "see"            "nothing"        "but"            "the"           
 [593] "great"          "gray"           "prairie"        "on"            
 [597] "every"          "side."          "Not"            "a"             
 [601] "tree"           "nor"            "a"              "house"         
 [605] "broke"          "the"            "broad"          "sweep"         
 [609] "of"             "flat"           "country"        "that"          
 [613] "reached"        "to"             "the"            "edge"          
 [617] "of"             "the"            "sky"            "in"            
 [621] "all"            "directions."    "The"            "sun"           
 [625] "had"            "baked"          "the"            "plowed"        
 [629] "land"           "into"           "a"              "gray"          
 [633] "mass,"          "with"           "little"         "cracks"        
 [637] "running"        "through"        "it."            "Even"          
 [641] "the"            "grass"          "was"            "not"           
 [645] "green,"         "for"            "the"            "sun"           
 [649] "had"            "burned"         "the"            "tops"          
 [653] "of"             "the"            "long"           "blades"        
 [657] "until"          "they"           "were"           "the"           
 [661] "same"           "gray"           "color"          "to"            
 [665] "be"             "seen"           "everywhere."    "Once"          
 [669] "the"            "house"          "had"            "been"          
 [673] "painted,"       "but"            "the"            "sun"           
 [677] "blistered"      "the"            "paint"          "and"           
 [681] "the"            "rains"          "washed"         "it"            
 [685] "away,"          "and"            "now"            "the"           
 [689] "house"          "was"            "as"             "dull"          
 [693] "and"            "gray"           "as"             "everything"    
 [697] "else."          ""               "When"           "Aunt"          
 [701] "Em"             "came"           "there"          "to"            
 [705] "live"           "she"            "was"            "a"             
 [709] "young,"         "pretty"         "wife."          "The"           
 [713] "sun"            "and"            "wind"           "had"           
 [717] "changed"        "her,"           "too."           "They"          
 [721] "had"            "taken"          "the"            "sparkle"       
 [725] "from"           "her"            "eyes"           "and"           
 [729] "left"           "them"           "a"              "sober"         
 [733] "gray;"          "they"           "had"            "taken"         
 [737] "the"            "red"            "from"           "her"           
 [741] "cheeks"         "and"            "lips,"          "and"           
 [745] "they"           "were"           "gray"           "also."         
 [749] "She"            "was"            "thin"           "and"           
 [753] "gaunt,"         "and"            "never"          "smiled"        
 [757] "now."           "When"           "Dorothy,"       "who"           
 [761] "was"            "an"             "orphan,"        "first"         
 [765] "came"           "to"             "her,"           "Aunt"          
 [769] "Em"             "had"            "been"           "so"            
 [773] "startled"       "by"             "the"            "child's"       
 [777] "laughter"       "that"           "she"            "would"         
 [781] "scream"         "and"            "press"          "her"           
 [785] "hand"           "upon"           "her"            "heart"         
 [789] "whenever"       "Dorothy's"      "merry"          "voice"         
 [793] "reached"        "her"            "ears;"          "and"           
 [797] "she"            "still"          "looked"         "at"            
 [801] "the"            "little"         "girl"           "with"          
 [805] "wonder"         "that"           "she"            "could"         
 [809] "find"           "anything"       "to"             "laugh"         
 [813] "at."            ""               "Uncle"          "Henry"         
 [817] "never"          "laughed."       "He"             "worked"        
 [821] "hard"           "from"           "morning"        "till"          
 [825] "night"          "and"            "did"            "not"           
 [829] "know"           "what"           "joy"            "was."          
 [833] "He"             "was"            "gray"           "also,"         
 [837] "from"           "his"            "long"           "beard"         
 [841] "to"             "his"            "rough"          "boots,"        
 [845] "and"            "he"             "looked"         "stern"         
 [849] "and"            "solemn,"        "and"            "rarely"        
 [853] "spoke."         ""               "It"             "was"           
 [857] "Toto"           "that"           "made"           "Dorothy"       
 [861] "laugh,"         "and"            "saved"          "her"           
 [865] "from"           "growing"        "as"             "gray"          
 [869] "as"             "her"            "other"          "surroundings." 
 [873] "Toto"           "was"            "not"            "gray;"         
 [877] "he"             "was"            "a"              "little"        
 [881] "black"          "dog,"           "with"           "long"          
 [885] "silky"          "hair"           "and"            "small"         
 [889] "black"          "eyes"           "that"           "twinkled"      
 [893] "merrily"        "on"             "either"         "side"          
 [897] "of"             "his"            "funny,"         "wee"           
 [901] "nose."          "Toto"           "played"         "all"           
 [905] "day"            "long,"          "and"            "Dorothy"       
 [909] "played"         "with"           "him,"           "and"           
 [913] "loved"          "him"            "dearly."        ""              
 [917] "Today,"         "however,"       "they"           "were"          
 [921] "not"            "playing."       "Uncle"          "Henry"         
 [925] "sat"            "upon"           "the"            "doorstep"      
 [929] "and"            "looked"         "anxiously"      "at"            
 [933] "the"            "sky,"           "which"          "was"           
 [937] "even"           "grayer"         "than"           "usual."        
 [941] "Dorothy"        "stood"          "in"             "the"           
 [945] "door"           "with"           "Toto"           "in"            
 [949] "her"            "arms,"          "and"            "looked"        
 [953] "at"             "the"            "sky"            "too."          
 [957] "Aunt"           "Em"             "was"            "washing"       
 [961] "the"            "dishes."        ""               "From"          
 [965] "the"            "far"            "north"          "they"          
 [969] "heard"          "a"              "low"            "wail"          
 [973] "of"             "the"            "wind,"          "and"           
 [977] "Uncle"          "Henry"          "and"            "Dorothy"       
 [981] "could"          "see"            "where"          "the"           
 [985] "long"           "grass"          "bowed"          "in"            
 [989] "waves"          "before"         "the"            "coming"        
 [993] "storm."         "There"          "now"            "came"          
 [997] "a"              "sharp"          "whistling"      "in"            

Strip Leading and Trailing Punctuation

In ICU regular expressions (the engine that stringr uses), \p{P} is a Unicode property class that matches any punctuation character:

Code
# Build the lexicon from the raw tokens. local() keeps the intermediate
# vectors out of the global environment.
ozWords <- local({
  # drop any run of punctuation at the start of each token ...
  noLeading <- str_remove(ozwds, pattern = "^\\p{P}+")
  # ... and any run of punctuation at the end
  trimmed <- str_remove(noLeading, pattern = "\\p{P}+$")
  # fold to lower case so that, e.g., "The" and "the" become one entry
  lowered <- str_to_lower(trimmed)
  # one copy of each distinct form, in sorted order
  str_sort(unique(lowered))
})
# show the first 1000 entries
ozWords[seq_len(1000)]
   [1] ""                 "1"                "10"              
   [4] "11"               "12"               "13"              
   [7] "14"               "15"               "16"              
  [10] "17"               "18"               "19"              
  [13] "1900"             "2"                "20"              
  [16] "21"               "22"               "23"              
  [19] "24"               "3"                "4"               
  [22] "5"                "6"                "7"               
  [25] "8"                "9"                "a"               
  [28] "able"             "abounding"        "about"           
  [31] "above"            "abundance"        "accident"        
  [34] "accidents"        "account"          "accounts"        
  [37] "ached"            "across"           "act"             
  [40] "action"           "actually"         "add"             
  [43] "added"            "addition"         "admit"           
  [46] "admitted"         "advanced"         "adventure"       
  [49] "adventures"       "advice"           "advise"          
  [52] "afford"           "afraid"           "afresh"          
  [55] "after"            "afternoon"        "afterward"       
  [58] "afterwards"       "again"            "against"         
  [61] "age"              "ages"             "ago"             
  [64] "agree"            "agreed"           "ah"              
  [67] "ahead"            "air"              "airtight"        
  [70] "alarm"            "alas"             "alive"           
  [73] "all"              "allow"            "allowed"         
  [76] "almost"           "alone"            "along"           
  [79] "already"          "also"             "although"        
  [82] "altogether"       "always"           "am"              
  [85] "amazement"        "among"            "amongst"         
  [88] "amuse"            "amused"           "an"              
  [91] "and"              "andersen"         "angered"         
  [94] "angrier"          "angrily"          "angry"           
  [97] "animal"           "animals"          "announced"       
 [100] "another"          "answer"           "answered"        
 [103] "ant"              "anxious"          "anxiously"       
 [106] "any"              "anybody"          "anyone"          
 [109] "anything"         "anyway"           "anywhere"        
 [112] "appeared"         "appears"          "approach"        
 [115] "approached"       "approve"          "april"           
 [118] "apron"            "arched"           "are"             
 [121] "aren't"           "arm"              "armed"           
 [124] "armless"          "arms"             "army"            
 [127] "arose"            "around"           "aroused"         
 [130] "arrive"           "art"              "arts"            
 [133] "as"               "ashamed"          "ask"             
 [136] "asked"            "asleep"           "aspires"         
 [139] "assemblage"       "assistance"       "assorted"        
 [142] "astonished"       "at"               "ate"             
 [145] "attack"           "attacked"         "attempt"         
 [148] "attended"         "audience"         "aunt"            
 [151] "authors"          "awake"            "awaken"          
 [154] "awakened"         "away"             "awful"           
 [157] "awfully"          "awkward"          "awoke"           
 [160] "axe"              "axe-handle"       "babies"          
 [163] "baby"             "back"             "backs"           
 [166] "backward"         "bad"              "bade"            
 [169] "badly"            "bag"              "baked"           
 [172] "balanced"         "bald"             "ball"            
 [175] "balloon"          "balloonist"       "balls"           
 [178] "band"             "bands"            "bank"            
 [181] "banks"            "bar"              "bark"            
 [184] "barked"           "barking"          "barn"            
 [187] "barns"            "barnyard"         "bars"            
 [190] "basin"            "basket"           "baskets"         
 [193] "bath"             "bathed"           "bathing"         
 [196] "battered"         "battle"           "baum"            
 [199] "be"               "beam"             "bear"            
 [202] "beard"            "beards"           "bearing"         
 [205] "bears"            "beast"            "beast's"         
 [208] "beasts"           "beat"             "beaten"          
 [211] "beating"          "beautiful"        "beautifully"     
 [214] "beauty"           "became"           "because"         
 [217] "beckon"           "become"           "becomes"         
 [220] "bed"              "beds"             "been"            
 [223] "bees"             "beetle"           "before"          
 [226] "beg"              "began"            "begged"          
 [229] "begin"            "beginning"        "begins"          
 [232] "begun"            "behind"           "behold"          
 [235] "being"            "believe"          "bell"            
 [238] "bells"            "belong"           "belonged"        
 [241] "belongs"          "below"            "bend"            
 [244] "bending"          "bent"             "beside"          
 [247] "besides"          "best"             "bestow"          
 [250] "better"           "between"          "bewilderment"    
 [253] "beyond"           "bidding"          "big"             
 [256] "bigger"           "biggest"          "bird"            
 [259] "birds"            "bit"              "bite"            
 [262] "bites"            "biting"           "bitten"          
 [265] "bitterly"         "black"            "blade"           
 [268] "blades"           "bleed"            "bless"           
 [271] "blew"             "blind"            "blinded"         
 [274] "blistered"        "block"            "blocks"          
 [277] "blood"            "blood-curdling"   "blossoms"        
 [280] "blow"             "blue"             "blunted"         
 [283] "boards"           "bodices"          "bodies"          
 [286] "body"             "bogs"             "boisterous"      
 [289] "boldly"           "bondage"          "books"           
 [292] "boots"            "boq"              "bordered"        
 [295] "born"             "both"             "bother"          
 [298] "bottle"           "bottom"           "bought"          
 [301] "bounded"          "bounding"         "bow"             
 [304] "bowed"            "bowing"           "bowl"            
 [307] "box"              "boy"              "bracelet"        
 [310] "braid"            "brain"            "brains"          
 [313] "bran"             "bran-new"         "branch"          
 [316] "branches"         "brave"            "bravely"         
 [319] "braver"           "bread"            "break"           
 [322] "breakfast"        "breakfasted"      "breaking"        
 [325] "breast"           "breath"           "breathe"         
 [328] "breathed"         "breathes"         "breathing"       
 [331] "breeches"         "breeze"           "brick"           
 [334] "bricks"           "bride"            "bridge"          
 [337] "bridges"          "bright"           "brighter"        
 [340] "brightest"        "brightly"         "brightness"      
 [343] "brilliancy"       "brilliant"        "brims"           
 [346] "bring"            "bringing"         "brings"          
 [349] "brisk"            "briskly"          "brittle"         
 [352] "broad"            "brocaded"         "broke"           
 [355] "broken"           "brook"            "brooks"          
 [358] "brother"          "brought"          "brown"           
 [361] "brownie"          "bruised"          "brutes"          
 [364] "bucket"           "buckles"          "bug"             
 [367] "build"            "building"         "built"           
 [370] "bulged"           "bundle"           "burn"            
 [373] "burned"           "burning"          "burnished"       
 [376] "burst"            "bushes"           "business"        
 [379] "busy"             "but"              "butter"          
 [382] "buttercups"       "butterflies"      "butterfly"       
 [385] "button"           "buzzing"          "by"              
 [388] "cabbages"         "cabinets"         "cackling"        
 [391] "cake"             "cakes"            "call"            
 [394] "called"           "calling"          "calmly"          
 [397] "came"             "camp"             "can"             
 [400] "can't"            "candy"            "cannon"          
 [403] "cannot"           "cap"              "caps"            
 [406] "care"             "cared"            "careful"         
 [409] "carefully"        "careless"         "carpet"          
 [412] "carpeted"         "carried"          "carries"         
 [415] "carry"            "carrying"         "carts"           
 [418] "carved"           "case"             "cast"            
 [421] "castle"           "cat"              "catch"           
 [424] "caught"           "caused"           "ceiling"         
 [427] "celebrate"        "cellar"           "center"          
 [430] "certain"          "certainly"        "chain"           
 [433] "chair"            "chairs"           "chalk"           
 [436] "chamber"          "chance"           "change"          
 [439] "changed"          "chariot"          "charm"           
 [442] "chase"            "chased"           "chasing"         
 [445] "chatted"          "chattering"       "checked"         
 [448] "checks"           "cheeks"           "cheer"           
 [451] "cheerful"         "cheerfully"       "cheers"          
 [454] "chest"            "chicago"          "chickens"        
 [457] "child"            "child's"          "childhood"       
 [460] "childish"         "children"         "children's"      
 [463] "chin"             "china"            "chop"            
 [466] "chopped"          "chopping"         "chorus"          
 [469] "chose"            "chubby"           "church"          
 [472] "circle"           "circus"           "city"            
 [475] "civilized"        "claim"            "clapped"         
 [478] "clapping"         "clasping"         "classed"         
 [481] "clatter"          "claws"            "clean"           
 [484] "cleaned"          "clear"            "cleared"         
 [487] "climb"            "climbed"          "climbing"        
 [490] "clinging"         "clings"           "close"           
 [493] "closed"           "closely"          "closer"          
 [496] "closing"          "cloth"            "clothed"         
 [499] "clothes"          "clothing"         "cloud"           
 [502] "clouds"           "clown"            "clowns"          
 [505] "clump"            "clumsy"           "clusters"        
 [508] "coal"             "coarse"           "coat"            
 [511] "coated"           "cock"             "coils"           
 [514] "cold"             "collar"           "collection"      
 [517] "color"            "colored"          "colors"          
 [520] "combed"           "come"             "comes"           
 [523] "comfort"          "comfortable"      "comforted"       
 [526] "coming"           "command"          "commanded"       
 [529] "commands"         "common"           "companion"       
 [532] "companions"       "company"          "compel"          
 [535] "completely"       "comrade"          "comrades"        
 [538] "condition"        "confidence"       "confidentially"  
 [541] "congratulate"     "connected"        "consider"        
 [544] "considerably"     "constantly"       "contained"       
 [547] "content"          "contented"        "contentedly"     
 [550] "contents"         "continue"         "continued"       
 [553] "converse"         "cooked"           "cookies"         
 [556] "cooking"          "cookstove"        "cool"            
 [559] "corn"             "corner"           "cornfield"       
 [562] "cost"             "costume"          "costumes"        
 [565] "cottage"          "cotton"           "couch"           
 [568] "could"            "couldn't"         "council"         
 [571] "counted"          "counterpane"      "countless"       
 [574] "countries"        "country"          "courage"         
 [577] "courageous"       "course"           "court"           
 [580] "courtyard"        "cover"            "covered"         
 [583] "covering"         "cow"              "cow's"           
 [586] "coward"           "cowardly"         "cows"            
 [589] "cozy"             "crack"            "cracked"         
 [592] "cracks"           "cradle"           "crash"           
 [595] "crawled"          "crawling"         "crawls"          
 [598] "creations"        "creature"         "creatures"       
 [601] "creeping"         "crept"            "cried"           
 [604] "crops"            "cross"            "crossed"         
 [607] "crouched"         "crow"             "crowd"           
 [610] "crowed"           "crowing"          "crown"           
 [613] "crowns"           "crows"            "cruel"           
 [616] "cruelty"          "crush"            "crushed"         
 [619] "cry"              "crying"           "cunning"         
 [622] "cupboard"         "curiosity"        "curious"         
 [625] "curiously"        "curled"           "current"         
 [628] "curtsy"           "cut"              "cyclone"         
 [631] "dainty"           "daisies"          "damaged"         
 [634] "dance"            "dancing"          "danger"          
 [637] "dangerous"        "dangers"          "dare"            
 [640] "dared"            "dark"             "darken"          
 [643] "darkened"         "darkness"         "darling"         
 [646] "dashed"           "daunted"          "day"             
 [649] "day's"            "daylight"         "days"            
 [652] "dazzled"          "dead"             "deadly"          
 [655] "deaf"             "deal"             "dear"            
 [658] "dearly"           "death"            "deceive"         
 [661] "deceived"         "decide"           "decided"         
 [664] "declared"         "deed"             "deeds"           
 [667] "deep"             "deeply"           "deer"            
 [670] "delicious"        "delight"          "delighted"       
 [673] "delightful"       "demand"           "demanded"        
 [676] "dented"           "dents"            "deprive"         
 [679] "depths"           "desert"           "deserted"        
 [682] "desire"           "desired"          "despairing"      
 [685] "despairingly"     "destroy"          "destroyed"       
 [688] "destroying"       "determined"       "devised"         
 [691] "dew"              "diamonds"         "did"             
 [694] "didn't"           "die"              "died"            
 [697] "dies"             "different"        "difficult"       
 [700] "dignified"        "dinner"           "direction"       
 [703] "directions"       "directly"         "disagreeable"    
 [706] "disappear"        "disappeared"      "disappointed"    
 [709] "disappointment"   "discomfort"       "discouragements" 
 [712] "discover"         "discovered"       "discovery"       
 [715] "disease"          "disgust"          "dish"            
 [718] "dishes"           "dismal"           "dismally"        
 [721] "dismay"           "dispenses"        "distance"        
 [724] "disturbed"        "ditch"            "divided"         
 [727] "dizzy"            "do"               "does"            
 [730] "doesn't"          "dog"              "dogs"            
 [733] "doing"            "dome"             "dominions"       
 [736] "don't"            "done"             "door"            
 [739] "doors"            "doorstep"         "doorway"         
 [742] "dorothy"          "dorothy's"        "dose"            
 [745] "dotted"           "doublets"         "doubtless"       
 [748] "down"             "downstream"       "dozen"           
 [751] "dragged"          "drags"            "drank"           
 [754] "draw"             "drawers"          "drawing"         
 [757] "drawing-room"     "drawn"            "dread"           
 [760] "dreadful"         "dreadful-looking" "dreadfully"      
 [763] "dream"            "dreamed"          "dreary"          
 [766] "dress"            "dressed"          "dresses"         
 [769] "drew"             "dried"            "drink"           
 [772] "drive"            "driven"           "drop"            
 [775] "dropped"          "drown"            "dry"             
 [778] "drying"           "due"              "dug"             
 [781] "dull"             "during"           "dust"            
 [784] "dwarf"            "dwellings"        "dwells"          
 [787] "dwelt"            "each"             "eagerly"         
 [790] "ear"              "early"            "earn"            
 [793] "earned"           "earnestly"        "ears"            
 [796] "earth"            "ease"             "easier"          
 [799] "easiest"          "easily"           "east"            
 [802] "easy"             "eat"              "eaten"           
 [805] "eating"           "eats"             "echoed"          
 [808] "edge"             "education"        "egg"             
 [811] "eggs"             "eight"            "either"          
 [814] "elbow"            "elephant"         "elephants"       
 [817] "eliminated"       "else"             "em"              
 [820] "em's"             "emerald"          "emerald-green"   
 [823] "emeralds"         "emptied"          "empty"           
 [826] "enabled"          "enchant"          "enchanted"       
 [829] "end"              "ended"            "enemy"           
 [832] "engaged"          "enjoyed"          "enjoying"        
 [835] "enormous"         "enough"           "enslave"         
 [838] "enter"            "entered"          "entering"        
 [841] "entertainment"    "entire"           "entirely"        
 [844] "ep-pe"            "ermine"           "errand"          
 [847] "escape"           "even"             "evening"         
 [850] "eventful"         "ever"             "evermore"        
 [853] "every"            "everybody"        "everyone"        
 [856] "everything"       "everywhere"       "evidently"       
 [859] "evil"             "exact"            "exactly"         
 [862] "except"           "exclaimed"        "excuse"          
 [865] "exhausted"        "expect"           "expected"        
 [868] "experience"       "explained"        "extend"          
 [871] "extra-large"      "eye"              "eyes"            
 [874] "face"             "faces"            "facing"          
 [877] "fact"             "faded"            "fail"            
 [880] "failed"           "faintest"         "fair"            
 [883] "fairies"          "fairly"           "fairy"           
 [886] "fall"             "fallen"           "falling"         
 [889] "falls"            "false"            "family"          
 [892] "fancy"            "fantastic"        "far"             
 [895] "farewell"         "farmer"           "farmer's"        
 [898] "farmers"          "farmhouse"        "farms"           
 [901] "farther"          "fashion"          "fast"            
 [904] "fasten"           "fastened"         "fastening"       
 [907] "fat"              "fate"             "father"          
 [910] "favor"            "favorite"         "favors"          
 [913] "fear"             "feared"           "fearing"         
 [916] "fears"            "fearsome"         "feasting"        
 [919] "feather"          "fed"              "feel"            
 [922] "feeling"          "feet"             "fell"            
 [925] "fellow"           "felt"             "fence"           
 [928] "fences"           "fetch"            "few"             
 [931] "fewer"            "fiddlers"         "field"           
 [934] "fields"           "fierce"           "fiercely"        
 [937] "fight"            "fighting"         "figure"          
 [940] "fill"             "filled"           "finally"         
 [943] "find"             "finding"          "fine"            
 [946] "fingers"          "finished"         "fire"            
 [949] "first"            "first-rate"       "fit"             
 [952] "fitted"           "five"             "flames"          
 [955] "flapping"         "flat"             "flattened"       
 [958] "flesh"            "flew"             "flight"          
 [961] "flights"          "float"            "floated"         
 [964] "floating"         "flock"            "flooding"        
 [967] "floor"            "flower"           "flowers"         
 [970] "flowing"          "flung"            "fluttered"       
 [973] "fly"              "flying"           "foe"             
 [976] "folding"          "folk"             "folklore"        
 [979] "follow"           "followed"         "following"       
 [982] "fond"             "food"             "fool"            
 [985] "fooled"           "foolish"          "foot"            
 [988] "footing"          "for"              "force"           
 [991] "forced"           "forefoot"         "forehead"        
 [994] "forest"           "forever"          "forgave"         
 [997] "forget"           "forgive"          "forgot"          
[1000] "form"            

Further Refinements

We see that we need to get rid of some numbers and a spurious empty string:

# Flag entries that begin with one or more digits.
isNumber <- ozWords %>% str_detect(pattern = "^\\d+")
# Flag the empty string left over from tokenizing.
isEmpty <- ozWords == ""
# An entry is kept only if neither flag is raised.
validWord <- !(isNumber | isEmpty)
ozWords <- ozWords[validWord]
# Show the first 1000 entries of the cleaned lexicon.
ozWords[seq_len(1000)]
   [1] "a"                "able"             "abounding"       
   [4] "about"            "above"            "abundance"       
   [7] "accident"         "accidents"        "account"         
  [10] "accounts"         "ached"            "across"          
  [13] "act"              "action"           "actually"        
  [16] "add"              "added"            "addition"        
  [19] "admit"            "admitted"         "advanced"        
  [22] "adventure"        "adventures"       "advice"          
  [25] "advise"           "afford"           "afraid"          
  [28] "afresh"           "after"            "afternoon"       
  [31] "afterward"        "afterwards"       "again"           
  [34] "against"          "age"              "ages"            
  [37] "ago"              "agree"            "agreed"          
  [40] "ah"               "ahead"            "air"             
  [43] "airtight"         "alarm"            "alas"            
  [46] "alive"            "all"              "allow"           
  [49] "allowed"          "almost"           "alone"           
  [52] "along"            "already"          "also"            
  [55] "although"         "altogether"       "always"          
  [58] "am"               "amazement"        "among"           
  [61] "amongst"          "amuse"            "amused"          
  [64] "an"               "and"              "andersen"        
  [67] "angered"          "angrier"          "angrily"         
  [70] "angry"            "animal"           "animals"         
  [73] "announced"        "another"          "answer"          
  [76] "answered"         "ant"              "anxious"         
  [79] "anxiously"        "any"              "anybody"         
  [82] "anyone"           "anything"         "anyway"          
  [85] "anywhere"         "appeared"         "appears"         
  [88] "approach"         "approached"       "approve"         
  [91] "april"            "apron"            "arched"          
  [94] "are"              "aren't"           "arm"             
  [97] "armed"            "armless"          "arms"            
 [100] "army"             "arose"            "around"          
 [103] "aroused"          "arrive"           "art"             
 [106] "arts"             "as"               "ashamed"         
 [109] "ask"              "asked"            "asleep"          
 [112] "aspires"          "assemblage"       "assistance"      
 [115] "assorted"         "astonished"       "at"              
 [118] "ate"              "attack"           "attacked"        
 [121] "attempt"          "attended"         "audience"        
 [124] "aunt"             "authors"          "awake"           
 [127] "awaken"           "awakened"         "away"            
 [130] "awful"            "awfully"          "awkward"         
 [133] "awoke"            "axe"              "axe-handle"      
 [136] "babies"           "baby"             "back"            
 [139] "backs"            "backward"         "bad"             
 [142] "bade"             "badly"            "bag"             
 [145] "baked"            "balanced"         "bald"            
 [148] "ball"             "balloon"          "balloonist"      
 [151] "balls"            "band"             "bands"           
 [154] "bank"             "banks"            "bar"             
 [157] "bark"             "barked"           "barking"         
 [160] "barn"             "barns"            "barnyard"        
 [163] "bars"             "basin"            "basket"          
 [166] "baskets"          "bath"             "bathed"          
 [169] "bathing"          "battered"         "battle"          
 [172] "baum"             "be"               "beam"            
 [175] "bear"             "beard"            "beards"          
 [178] "bearing"          "bears"            "beast"           
 [181] "beast's"          "beasts"           "beat"            
 [184] "beaten"           "beating"          "beautiful"       
 [187] "beautifully"      "beauty"           "became"          
 [190] "because"          "beckon"           "become"          
 [193] "becomes"          "bed"              "beds"            
 [196] "been"             "bees"             "beetle"          
 [199] "before"           "beg"              "began"           
 [202] "begged"           "begin"            "beginning"       
 [205] "begins"           "begun"            "behind"          
 [208] "behold"           "being"            "believe"         
 [211] "bell"             "bells"            "belong"          
 [214] "belonged"         "belongs"          "below"           
 [217] "bend"             "bending"          "bent"            
 [220] "beside"           "besides"          "best"            
 [223] "bestow"           "better"           "between"         
 [226] "bewilderment"     "beyond"           "bidding"         
 [229] "big"              "bigger"           "biggest"         
 [232] "bird"             "birds"            "bit"             
 [235] "bite"             "bites"            "biting"          
 [238] "bitten"           "bitterly"         "black"           
 [241] "blade"            "blades"           "bleed"           
 [244] "bless"            "blew"             "blind"           
 [247] "blinded"          "blistered"        "block"           
 [250] "blocks"           "blood"            "blood-curdling"  
 [253] "blossoms"         "blow"             "blue"            
 [256] "blunted"          "boards"           "bodices"         
 [259] "bodies"           "body"             "bogs"            
 [262] "boisterous"       "boldly"           "bondage"         
 [265] "books"            "boots"            "boq"             
 [268] "bordered"         "born"             "both"            
 [271] "bother"           "bottle"           "bottom"          
 [274] "bought"           "bounded"          "bounding"        
 [277] "bow"              "bowed"            "bowing"          
 [280] "bowl"             "box"              "boy"             
 [283] "bracelet"         "braid"            "brain"           
 [286] "brains"           "bran"             "bran-new"        
 [289] "branch"           "branches"         "brave"           
 [292] "bravely"          "braver"           "bread"           
 [295] "break"            "breakfast"        "breakfasted"     
 [298] "breaking"         "breast"           "breath"          
 [301] "breathe"          "breathed"         "breathes"        
 [304] "breathing"        "breeches"         "breeze"          
 [307] "brick"            "bricks"           "bride"           
 [310] "bridge"           "bridges"          "bright"          
 [313] "brighter"         "brightest"        "brightly"        
 [316] "brightness"       "brilliancy"       "brilliant"       
 [319] "brims"            "bring"            "bringing"        
 [322] "brings"           "brisk"            "briskly"         
 [325] "brittle"          "broad"            "brocaded"        
 [328] "broke"            "broken"           "brook"           
 [331] "brooks"           "brother"          "brought"         
 [334] "brown"            "brownie"          "bruised"         
 [337] "brutes"           "bucket"           "buckles"         
 [340] "bug"              "build"            "building"        
 [343] "built"            "bulged"           "bundle"          
 [346] "burn"             "burned"           "burning"         
 [349] "burnished"        "burst"            "bushes"          
 [352] "business"         "busy"             "but"             
 [355] "butter"           "buttercups"       "butterflies"     
 [358] "butterfly"        "button"           "buzzing"         
 [361] "by"               "cabbages"         "cabinets"        
 [364] "cackling"         "cake"             "cakes"           
 [367] "call"             "called"           "calling"         
 [370] "calmly"           "came"             "camp"            
 [373] "can"              "can't"            "candy"           
 [376] "cannon"           "cannot"           "cap"             
 [379] "caps"             "care"             "cared"           
 [382] "careful"          "carefully"        "careless"        
 [385] "carpet"           "carpeted"         "carried"         
 [388] "carries"          "carry"            "carrying"        
 [391] "carts"            "carved"           "case"            
 [394] "cast"             "castle"           "cat"             
 [397] "catch"            "caught"           "caused"          
 [400] "ceiling"          "celebrate"        "cellar"          
 [403] "center"           "certain"          "certainly"       
 [406] "chain"            "chair"            "chairs"          
 [409] "chalk"            "chamber"          "chance"          
 [412] "change"           "changed"          "chariot"         
 [415] "charm"            "chase"            "chased"          
 [418] "chasing"          "chatted"          "chattering"      
 [421] "checked"          "checks"           "cheeks"          
 [424] "cheer"            "cheerful"         "cheerfully"      
 [427] "cheers"           "chest"            "chicago"         
 [430] "chickens"         "child"            "child's"         
 [433] "childhood"        "childish"         "children"        
 [436] "children's"       "chin"             "china"           
 [439] "chop"             "chopped"          "chopping"        
 [442] "chorus"           "chose"            "chubby"          
 [445] "church"           "circle"           "circus"          
 [448] "city"             "civilized"        "claim"           
 [451] "clapped"          "clapping"         "clasping"        
 [454] "classed"          "clatter"          "claws"           
 [457] "clean"            "cleaned"          "clear"           
 [460] "cleared"          "climb"            "climbed"         
 [463] "climbing"         "clinging"         "clings"          
 [466] "close"            "closed"           "closely"         
 [469] "closer"           "closing"          "cloth"           
 [472] "clothed"          "clothes"          "clothing"        
 [475] "cloud"            "clouds"           "clown"           
 [478] "clowns"           "clump"            "clumsy"          
 [481] "clusters"         "coal"             "coarse"          
 [484] "coat"             "coated"           "cock"            
 [487] "coils"            "cold"             "collar"          
 [490] "collection"       "color"            "colored"         
 [493] "colors"           "combed"           "come"            
 [496] "comes"            "comfort"          "comfortable"     
 [499] "comforted"        "coming"           "command"         
 [502] "commanded"        "commands"         "common"          
 [505] "companion"        "companions"       "company"         
 [508] "compel"           "completely"       "comrade"         
 [511] "comrades"         "condition"        "confidence"      
 [514] "confidentially"   "congratulate"     "connected"       
 [517] "consider"         "considerably"     "constantly"      
 [520] "contained"        "content"          "contented"       
 [523] "contentedly"      "contents"         "continue"        
 [526] "continued"        "converse"         "cooked"          
 [529] "cookies"          "cooking"          "cookstove"       
 [532] "cool"             "corn"             "corner"          
 [535] "cornfield"        "cost"             "costume"         
 [538] "costumes"         "cottage"          "cotton"          
 [541] "couch"            "could"            "couldn't"        
 [544] "council"          "counted"          "counterpane"     
 [547] "countless"        "countries"        "country"         
 [550] "courage"          "courageous"       "course"          
 [553] "court"            "courtyard"        "cover"           
 [556] "covered"          "covering"         "cow"             
 [559] "cow's"            "coward"           "cowardly"        
 [562] "cows"             "cozy"             "crack"           
 [565] "cracked"          "cracks"           "cradle"          
 [568] "crash"            "crawled"          "crawling"        
 [571] "crawls"           "creations"        "creature"        
 [574] "creatures"        "creeping"         "crept"           
 [577] "cried"            "crops"            "cross"           
 [580] "crossed"          "crouched"         "crow"            
 [583] "crowd"            "crowed"           "crowing"         
 [586] "crown"            "crowns"           "crows"           
 [589] "cruel"            "cruelty"          "crush"           
 [592] "crushed"          "cry"              "crying"          
 [595] "cunning"          "cupboard"         "curiosity"       
 [598] "curious"          "curiously"        "curled"          
 [601] "current"          "curtsy"           "cut"             
 [604] "cyclone"          "dainty"           "daisies"         
 [607] "damaged"          "dance"            "dancing"         
 [610] "danger"           "dangerous"        "dangers"         
 [613] "dare"             "dared"            "dark"            
 [616] "darken"           "darkened"         "darkness"        
 [619] "darling"          "dashed"           "daunted"         
 [622] "day"              "day's"            "daylight"        
 [625] "days"             "dazzled"          "dead"            
 [628] "deadly"           "deaf"             "deal"            
 [631] "dear"             "dearly"           "death"           
 [634] "deceive"          "deceived"         "decide"          
 [637] "decided"          "declared"         "deed"            
 [640] "deeds"            "deep"             "deeply"          
 [643] "deer"             "delicious"        "delight"         
 [646] "delighted"        "delightful"       "demand"          
 [649] "demanded"         "dented"           "dents"           
 [652] "deprive"          "depths"           "desert"          
 [655] "deserted"         "desire"           "desired"         
 [658] "despairing"       "despairingly"     "destroy"         
 [661] "destroyed"        "destroying"       "determined"      
 [664] "devised"          "dew"              "diamonds"        
 [667] "did"              "didn't"           "die"             
 [670] "died"             "dies"             "different"       
 [673] "difficult"        "dignified"        "dinner"          
 [676] "direction"        "directions"       "directly"        
 [679] "disagreeable"     "disappear"        "disappeared"     
 [682] "disappointed"     "disappointment"   "discomfort"      
 [685] "discouragements"  "discover"         "discovered"      
 [688] "discovery"        "disease"          "disgust"         
 [691] "dish"             "dishes"           "dismal"          
 [694] "dismally"         "dismay"           "dispenses"       
 [697] "distance"         "disturbed"        "ditch"           
 [700] "divided"          "dizzy"            "do"              
 [703] "does"             "doesn't"          "dog"             
 [706] "dogs"             "doing"            "dome"            
 [709] "dominions"        "don't"            "done"            
 [712] "door"             "doors"            "doorstep"        
 [715] "doorway"          "dorothy"          "dorothy's"       
 [718] "dose"             "dotted"           "doublets"        
 [721] "doubtless"        "down"             "downstream"      
 [724] "dozen"            "dragged"          "drags"           
 [727] "drank"            "draw"             "drawers"         
 [730] "drawing"          "drawing-room"     "drawn"           
 [733] "dread"            "dreadful"         "dreadful-looking"
 [736] "dreadfully"       "dream"            "dreamed"         
 [739] "dreary"           "dress"            "dressed"         
 [742] "dresses"          "drew"             "dried"           
 [745] "drink"            "drive"            "driven"          
 [748] "drop"             "dropped"          "drown"           
 [751] "dry"              "drying"           "due"             
 [754] "dug"              "dull"             "during"          
 [757] "dust"             "dwarf"            "dwellings"       
 [760] "dwells"           "dwelt"            "each"            
 [763] "eagerly"          "ear"              "early"           
 [766] "earn"             "earned"           "earnestly"       
 [769] "ears"             "earth"            "ease"            
 [772] "easier"           "easiest"          "easily"          
 [775] "east"             "easy"             "eat"             
 [778] "eaten"            "eating"           "eats"            
 [781] "echoed"           "edge"             "education"       
 [784] "egg"              "eggs"             "eight"           
 [787] "either"           "elbow"            "elephant"        
 [790] "elephants"        "eliminated"       "else"            
 [793] "em"               "em's"             "emerald"         
 [796] "emerald-green"    "emeralds"         "emptied"         
 [799] "empty"            "enabled"          "enchant"         
 [802] "enchanted"        "end"              "ended"           
 [805] "enemy"            "engaged"          "enjoyed"         
 [808] "enjoying"         "enormous"         "enough"          
 [811] "enslave"          "enter"            "entered"         
 [814] "entering"         "entertainment"    "entire"          
 [817] "entirely"         "ep-pe"            "ermine"          
 [820] "errand"           "escape"           "even"            
 [823] "evening"          "eventful"         "ever"            
 [826] "evermore"         "every"            "everybody"       
 [829] "everyone"         "everything"       "everywhere"      
 [832] "evidently"        "evil"             "exact"           
 [835] "exactly"          "except"           "exclaimed"       
 [838] "excuse"           "exhausted"        "expect"          
 [841] "expected"         "experience"       "explained"       
 [844] "extend"           "extra-large"      "eye"             
 [847] "eyes"             "face"             "faces"           
 [850] "facing"           "fact"             "faded"           
 [853] "fail"             "failed"           "faintest"        
 [856] "fair"             "fairies"          "fairly"          
 [859] "fairy"            "fall"             "fallen"          
 [862] "falling"          "falls"            "false"           
 [865] "family"           "fancy"            "fantastic"       
 [868] "far"              "farewell"         "farmer"          
 [871] "farmer's"         "farmers"          "farmhouse"       
 [874] "farms"            "farther"          "fashion"         
 [877] "fast"             "fasten"           "fastened"        
 [880] "fastening"        "fat"              "fate"            
 [883] "father"           "favor"            "favorite"        
 [886] "favors"           "fear"             "feared"          
 [889] "fearing"          "fears"            "fearsome"        
 [892] "feasting"         "feather"          "fed"             
 [895] "feel"             "feeling"          "feet"            
 [898] "fell"             "fellow"           "felt"            
 [901] "fence"            "fences"           "fetch"           
 [904] "few"              "fewer"            "fiddlers"        
 [907] "field"            "fields"           "fierce"          
 [910] "fiercely"         "fight"            "fighting"        
 [913] "figure"           "fill"             "filled"          
 [916] "finally"          "find"             "finding"         
 [919] "fine"             "fingers"          "finished"        
 [922] "fire"             "first"            "first-rate"      
 [925] "fit"              "fitted"           "five"            
 [928] "flames"           "flapping"         "flat"            
 [931] "flattened"        "flesh"            "flew"            
 [934] "flight"           "flights"          "float"           
 [937] "floated"          "floating"         "flock"           
 [940] "flooding"         "floor"            "flower"          
 [943] "flowers"          "flowing"          "flung"           
 [946] "fluttered"        "fly"              "flying"          
 [949] "foe"              "folding"          "folk"            
 [952] "folklore"         "follow"           "followed"        
 [955] "following"        "fond"             "food"            
 [958] "fool"             "fooled"           "foolish"         
 [961] "foot"             "footing"          "for"             
 [964] "force"            "forced"           "forefoot"        
 [967] "forehead"         "forest"           "forever"         
 [970] "forgave"          "forget"           "forgive"         
 [973] "forgot"           "form"             "forms"           
 [976] "forth"            "forthwith"        "fortunate"       
 [979] "fortunately"      "fortune"          "forty"           
 [982] "forward"          "fought"           "found"           
 [985] "fountain"         "four"             "fourth"          
 [988] "foxes"            "fragrance"        "frail"           
 [991] "frank"            "free"             "freed"           
 [994] "freedom"          "freely"           "fresh"           
 [997] "friend"           "friendly"         "friends"         
[1000] "fright"          

Success!

Finally, this seems good enough. We have our lexicon!

Making an Index

A helper function:

# Build a word -> line-number index for a text file.
#
# lexicon: character vector of words to index.
# fn:      path of the text file to read.
#
# Returns a named list with one element per word of `lexicon`; each element
# holds the numbers of the lines of `fn` on which that word occurs as a whole
# word, ignoring case (an empty integer vector if it never occurs).
indexFactory <- function(lexicon, fn) {
  index <- list()
  fileLines <- readLines(con = fn)
  for (word in lexicon) {
    # "\\b" is the regex word boundary.  Writing "\\\b" instead yields a
    # backslash followed by a literal backspace character, which no line of
    # ordinary text contains, so nothing would ever match.
    pattern <- str_c("(?i)\\b", word, "\\b")
    hasWord <- str_detect(fileLines, pattern = pattern)
    index[[word]] <- which(hasWord)
  }
  index
}

Now make the index:

# index every word of the lexicon against the lines of the downloaded file
ozIndex <- indexFactory(lexicon = ozWords, fn = "downloads/oz.txt")

A Look-up Function

# Print every line of a text file that contains a given word.
#
# word:   the word to look up; it must appear in the global lexicon `ozWords`.
# source: path of the text file whose line numbers the global index `ozIndex`
#         refers to.
#
# Prints the number of matching lines, then each matching line preceded by a
# rule and its line number.  If `word` is not in the lexicon, prints a notice
# instead.  Called for its printed output; returns NULL invisibly.
ozLookup <- function(word, source) {

  lexicon <- ozWords
  index <- ozIndex

  if (!(word %in% lexicon)) {
    cat(paste0("\"", word, "\" is not in the lexicon!\n"))
    return(invisible(NULL))
  }

  # read the file only once we know the word can be looked up
  fileLines <- readLines(con = source)

  matchLines <- index[[word]]
  number <- length(matchLines)
  cat(
    "There are ", number,
    "lines that contain your request.\n\n"
  )
  hrule <- rep("-", times = 30)
  # seq_along() is empty when there are no matches, whereas 1:number would
  # run over c(1, 0) and print spurious "NA" entries
  for (i in seq_along(matchLines)) {
    lineNum <- matchLines[i]
    cat(hrule, "\n")
    cat(lineNum, ":  ", fileLines[lineNum], "\n")
  }
  invisible(NULL)
}

Try It!

ozLookup("humbug", source = "downloads/oz.txt")
There are  0 lines that contain your request.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
NA :   NA 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
 :    

Try It Again

ozLookup("lolliop", source = "downloads/oz.txt")
"lolliop" is not in the lexicon!