Scripts for extracting lines and generating shanties

5 years ago · 2dd06bf461
2 changed files with 156 additions and 0 deletions
--- a/lyric-extract.py
+++ b/lyric-extract.py
@ -0,0 +1,51 @@
 ## First, use wget to scrape www.jsward.com for sea shanties
 ## Then copy all the HTML files to html/
 from bs4 import BeautifulSoup
 import os
 import re
 os.chdir('/home/bgcarlisle/Projects/Shantybot/')
 ## Sort the listing so the corpus is written in a reproducible order
 files = sorted(os.listdir('html'))
 os.chdir('html')
 ## Lines matching any of these patterns are boilerplate, not lyrics.
 ## They are compiled once here instead of once per line inside the loop.
 boilerplate_patterns = (
     ## Copyright lines
     re.compile(r'Copyright'),
     ## Author credits in their various forms
     re.compile(r'^By [A-Za-z]+ [A-Za-z]+'),
     re.compile(r'^Lyrics from'),
     re.compile(r'^Traditional'),
     re.compile(r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}'),
 )
 ## Open the corpus once (append mode, as before) rather than once per page
 with open('../corpus.txt', 'a') as outputfile:
     for filename in files:
         ## Skip sub-directories and other non-file entries; open() would fail
         if not os.path.isfile(filename):
             continue
         with open(filename) as shanty:
             print(filename)
             doc = BeautifulSoup(shanty, 'html.parser')
         ## Drop the page head and the first h1/h2 title, when present.
         ## Each may be missing, in which case the attribute is None.
         for heading in (doc.head, doc.h1, doc.h2):
             if heading is not None:
                 heading.decompose()
         ## html.parser does not invent a <body>, so fall back to the
         ## whole document for pages that lack one
         root = doc.body if doc.body is not None else doc
         lines = root.get_text().split('\n')
         ## Keep non-empty lines that match none of the boilerplate patterns
         lines = [
             line for line in lines
             if line.strip()
             and not any(pattern.search(line) for pattern in boilerplate_patterns)
         ]
         outputfile.writelines(line + '\n' for line in lines)
 ## Now you have all the sea shanty lines in a single corpus.txt file
--- a/shantybot.py
+++ b/shantybot.py
@ -0,0 +1,105 @@
 import markovify
 import os
 import re
 import textstat
 import pronouncing
 os.chdir('/home/bgcarlisle/Projects/Shantybot/')
 with open('corpus.txt') as corpus:
     mmodel = markovify.NewlineText(corpus)

 ## A seed line is only usable if its last word has at least this many rhymes
 MIN_RHYMES = 6
 ## How many candidate lines to try before giving up on a seed line
 MAX_MATCH_TRIES = 1000
 ## Passed to markovify as the per-call sentence generation budget
 SENTENCE_TRIES = 10000


 def clean_line(sentence):
     """Return sentence with a single trailing comma or semicolon removed."""
     return re.sub(r'[,;]$', '', sentence)


 def last_word(line):
     """Return the final space-separated word of line, letters only.

     The word is lowercased because pronouncing.rhymes() returns lowercase
     words; without this a capitalised final word could never rhyme.
     """
     return re.sub(r'[^A-Za-z]', '', line.split(" ")[-1]).lower()


 def make_seed_line():
     """Generate a line whose last word has at least MIN_RHYMES rhymes.

     Returns a (line, syllable_count, rhymes) tuple, where rhymes is the set
     of words that rhyme with the line's last word. Keeps trying until a
     suitable line is produced.
     """
     while True:
         sentence = mmodel.make_sentence()
         ## markovify returns None when it fails to build a sentence
         if sentence is None:
             continue
         line = clean_line(sentence)
         rhymes = set(pronouncing.rhymes(last_word(line)))
         if len(rhymes) >= MIN_RHYMES:
             return line, textstat.syllable_count(line), rhymes


 def make_matching_line(syllables, rhymes):
     """Generate a line with the given syllable count ending in a rhyme.

     Returns the line, or None if no candidate satisfied BOTH conditions
     within MAX_MATCH_TRIES attempts.
     """
     for _ in range(MAX_MATCH_TRIES):
         sentence = mmodel.make_sentence(tries=SENTENCE_TRIES)
         ## A failed generation just counts as a used-up attempt
         if sentence is None:
             continue
         line = clean_line(sentence)
         if textstat.syllable_count(line) == syllables and last_word(line) in rhymes:
             return line
     return None


 ## Build an ABAB verse: lines 1 and 2 are seeds, line 3 must match line 1
 ## and line 4 must match line 2 (same syllable count, rhyming last word).
 ## If either match cannot be found, start over with fresh seed lines.
 while True:
     first_line, first_line_syllables, first_line_rhymes = make_seed_line()
     second_line, second_line_syllables, second_line_rhymes = make_seed_line()
     third_line = make_matching_line(first_line_syllables, first_line_rhymes)
     if third_line is None:
         continue
     fourth_line = make_matching_line(second_line_syllables, second_line_rhymes)
     if fourth_line is not None:
         break
 print(first_line)
 print(second_line)
 print(third_line)
 print(fourth_line)
 print()