## lyric-extract.py
##
## Reconstructed from patch 2dd06bf461529ec2cbe4b6389dfcc70c8380ccb9
## (Benjamin G Carlisle, Thu, 4 Feb 2021 13:50:04 +0100,
## "Scripts for extracting lines and generating shanties").
##
## First, use wget to scrape www.jsward.com for sea shanties
## Then copy all the HTML files to html/
## Then run this script: it appends every lyric line it finds to corpus.txt
##
## Usage: python lyric-extract.py [project_dir]

import os
import re
import sys

## Project directory used when none is given on the command line
DEFAULT_PROJECT_DIR = '/home/bgcarlisle/Projects/Shantybot/'

## A line matching any of these is a copyright notice or an author credit,
## not a lyric. Compiled once here rather than once per line.
CREDIT_PATTERNS = (
    re.compile(r'Copyright'),
    re.compile(r'^By [A-Za-z]+ [A-Za-z]+'),
    re.compile(r'^Lyrics from'),
    re.compile(r'^Traditional'),
    re.compile(r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}'),
)


def clean_lyric_lines(text):
    """Split *text* on newlines and return only the lyric lines.

    Whitespace-only lines are dropped, as are lines matching any of
    CREDIT_PATTERNS. Kept lines are returned unmodified (not stripped), so
    the ``^``-anchored patterns only match credits that start in column 0.
    """
    return [
        line
        for line in text.split('\n')
        if line.strip()
        and not any(pattern.search(line) for pattern in CREDIT_PATTERNS)
    ]


def extract_lyric_lines(markup):
    """Return the lyric lines contained in one HTML document.

    *markup* is anything BeautifulSoup accepts (str, bytes or an open file).
    The <head> and the first <h1> and <h2> are discarded before the text is
    extracted. A document without <head> or <body> is tolerated instead of
    raising AttributeError.
    """
    ## Imported here so the pure line filter above stays usable without bs4
    from bs4 import BeautifulSoup

    doc = BeautifulSoup(markup, 'html.parser')

    if doc.head is not None:
        doc.head.decompose()

    for tag_name in ('h1', 'h2'):
        heading = doc.find(tag_name)
        if heading is not None:
            heading.decompose()

    ## Fall back to the whole document when there is no <body> element
    root = doc.body if doc.body is not None else doc
    return clean_lyric_lines(root.get_text())


def main(project_dir=DEFAULT_PROJECT_DIR):
    """Append the lyric lines of every file in <project_dir>/html to corpus.txt."""
    html_dir = os.path.join(project_dir, 'html')
    corpus_path = os.path.join(project_dir, 'corpus.txt')

    ## Open the corpus once for the whole run instead of once per input file
    with open(corpus_path, 'a', encoding='utf-8') as outputfile:
        ## Sorted so that repeated runs produce the same corpus order
        for filename in sorted(os.listdir(html_dir)):
            path = os.path.join(html_dir, filename)
            if not os.path.isfile(path):
                ## Sub-directories left behind by wget cannot be opened
                continue

            print(filename)

            ## Binary mode lets BeautifulSoup detect each page's own encoding
            with open(path, 'rb') as shanty:
                lines = extract_lyric_lines(shanty)

            outputfile.writelines(line + '\n' for line in lines)


if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PROJECT_DIR)

## Now you have all the sea shanty lines in a single corpus.txt file
## shantybot.py
##
## Reconstructed from patch 2dd06bf461529ec2cbe4b6389dfcc70c8380ccb9
## (new file, index 6ac0a77).
##
## Builds a Markov model from corpus.txt and prints a four-line verse in
## which line 3 rhymes with line 1 and line 4 rhymes with line 2, each pair
## having the same syllable count.
##
## Usage: python shantybot.py [project_dir]

import os
import re
import sys

## Project directory used when none is given on the command line
DEFAULT_PROJECT_DIR = '/home/bgcarlisle/Projects/Shantybot/'

## An opening line is only accepted if its last word has this many rhymes
MIN_RHYMES = 6
## Attempts at finding an opening line before giving up
MAX_OPENING_TRIES = 10000
## Attempts at finding a rhyming partner before starting over
MAX_MATCH_TRIES = 1000
## "tries" passed to the sentence generator when looking for a partner line
SENTENCE_TRIES = 10000

TRAILING_PUNCTUATION = re.compile(r'[,;]$')
NON_LETTERS = re.compile(r'[^A-Za-z]')


def tidy_line(sentence):
    """Return *sentence* without a single trailing comma or semicolon."""
    return TRAILING_PUNCTUATION.sub('', sentence)


def last_word(line):
    """Return the last word of *line*, letters only and lower-cased.

    Lower-casing matters: the rhyme lists it is compared against are all
    lower-case, so a capitalised word would otherwise never match.
    Returns '' for an empty or whitespace-only line.
    """
    words = line.split()
    if not words:
        return ''
    return NON_LETTERS.sub('', words[-1]).lower()


def pick_opening_line(make_sentence, count_syllables, find_rhymes):
    """Generate a line whose last word has at least MIN_RHYMES rhymes.

    Returns a ``(line, syllable_count, rhymes)`` tuple where *rhymes* is a
    frozenset of words. Raises RuntimeError if no such line is produced in
    MAX_OPENING_TRIES attempts.
    """
    for _ in range(MAX_OPENING_TRIES):
        sentence = make_sentence()
        if sentence is None:
            ## The generator gives None when it fails to build a sentence
            continue

        line = tidy_line(sentence)
        rhymes = find_rhymes(last_word(line))
        if len(rhymes) >= MIN_RHYMES:
            return line, count_syllables(line), frozenset(rhymes)

    raise RuntimeError('could not generate a line with enough rhymes')


def pick_matching_line(make_sentence, count_syllables, syllables, rhymes):
    """Generate a line with *syllables* syllables ending in one of *rhymes*.

    Returns the line, or None if MAX_MATCH_TRIES attempts all failed.
    """
    for _ in range(MAX_MATCH_TRIES):
        sentence = make_sentence(tries=SENTENCE_TRIES)
        if sentence is None:
            continue

        line = tidy_line(sentence)
        if count_syllables(line) == syllables and last_word(line) in rhymes:
            return line

    return None


def make_shanty(make_sentence, count_syllables, find_rhymes):
    """Return a 4-tuple of lines rhyming ABAB with matching syllable counts.

    make_sentence   -- callable accepting an optional ``tries`` keyword and
                       returning a sentence string or None
    count_syllables -- callable mapping a line to its syllable count
    find_rhymes     -- callable mapping a word to a collection of rhymes

    Whenever no partner is found for an opening line, both opening lines
    are thrown away and the search starts over.
    """
    while True:
        first, first_syllables, first_rhymes = pick_opening_line(
            make_sentence, count_syllables, find_rhymes)
        second, second_syllables, second_rhymes = pick_opening_line(
            make_sentence, count_syllables, find_rhymes)

        third = pick_matching_line(
            make_sentence, count_syllables, first_syllables, first_rhymes)
        if third is None:
            ## No point looking for a fourth line; start over
            continue

        fourth = pick_matching_line(
            make_sentence, count_syllables, second_syllables, second_rhymes)
        if fourth is None:
            continue

        return first, second, third, fourth


def main(project_dir=DEFAULT_PROJECT_DIR):
    """Build the model from <project_dir>/corpus.txt and print one verse."""
    ## Imported here so the helpers above can be used without these packages
    import markovify
    import pronouncing
    import textstat

    corpus_path = os.path.join(project_dir, 'corpus.txt')
    with open(corpus_path, encoding='utf-8') as corpus:
        mmodel = markovify.NewlineText(corpus.read())

    shanty = make_shanty(
        mmodel.make_sentence, textstat.syllable_count, pronouncing.rhymes)

    for line in shanty:
        print(line)
    print()


if __name__ == '__main__':
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PROJECT_DIR)