Browse Source

Scripts for extracting lines and generating shanties

main
Benjamin G Carlisle 5 years ago
parent
commit
2dd06bf461
  1. 51
      lyric-extract.py
  2. 105
      shantybot.py

51
lyric-extract.py

@ -0,0 +1,51 @@
## First, use wget to scrape www.jsward.com for sea shanties
## Then copy all the HTML files to html/
from bs4 import BeautifulSoup
import os
import re
PROJECT_DIR = '/home/bgcarlisle/Projects/Shantybot/'

## A line matching any of these is a credit or boilerplate, not a lyric.
## Compiled once here rather than once per line of every file.
NON_LYRIC_PATTERNS = (
    ## Copyright lines
    re.compile(r'Copyright'),
    ## Author credits
    re.compile(r'^By [A-Za-z]+ [A-Za-z]+'),
    re.compile(r'^Lyrics from'),
    re.compile(r'^Traditional'),
    re.compile(r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}'),
)


def filter_lyric_lines(lines):
    """Return the entries of *lines* that look like lyrics.

    Blank / whitespace-only lines are dropped, as is any line matching one
    of NON_LYRIC_PATTERNS. Lines are not stripped before matching, so the
    anchored patterns only match text starting in the first column.
    """
    return [
        line for line in lines
        if line.strip()
        and not any(pattern.search(line) for pattern in NON_LYRIC_PATTERNS)
    ]


def extract_lines(markup):
    """Parse one HTML document and return its lyric lines.

    *markup* is anything BeautifulSoup accepts (a string or an open file).
    The <head> and the first <h1> and <h2> are removed before the text is
    extracted.
    """
    doc = BeautifulSoup(markup, 'html.parser')
    ## Look each tag up only after the previous one has been removed, and
    ## tolerate pages that lack it (tag lookup yields None in that case).
    for tag_name in ('head', 'h1', 'h2'):
        tag = doc.find(tag_name)
        if tag is not None:
            tag.decompose()
    ## html.parser does not invent a <body>; fall back to the whole document
    root = doc.body if doc.body is not None else doc
    return filter_lyric_lines(root.get_text().split('\n'))


def main():
    """Append the lyric lines of every file in html/ to corpus.txt."""
    os.chdir(PROJECT_DIR)
    files = os.listdir('html')
    os.chdir('html')
    for filename in files:
        ## Skip stray subdirectories instead of crashing in open()
        if not os.path.isfile(filename):
            continue
        with open(filename) as shanty:
            print(filename)
            lines = extract_lines(shanty)
        with open('../corpus.txt', 'a') as outputfile:
            outputfile.writelines(line + '\n' for line in lines)


if __name__ == '__main__':
    main()
## Now you have all the sea shanty lines in a single corpus.txt file

105
shantybot.py

@ -0,0 +1,105 @@
import markovify
import os
import re
import textstat
import pronouncing
PROJECT_DIR = '/home/bgcarlisle/Projects/Shantybot/'

## An opening line is only kept if its last word has this many known rhymes
MIN_RHYMES = 6
## Attempts at finding a matching line before starting the verse over
MAX_MATCH_TRIES = 1000
## Consecutive empty results tolerated from the model before giving up
MAX_EMPTY_RESULTS = 1000


def last_word(line):
    """Return the final space-separated word of *line*, letters only."""
    return re.sub(r'[^A-Za-z]', '', line.split(" ")[-1])


def make_line(model, **kwargs):
    """Return one generated line with a trailing ',' or ';' removed.

    *kwargs* are passed through to ``model.make_sentence``. That call
    returns None when it cannot build a sentence, so it is retried up to
    MAX_EMPTY_RESULTS times rather than handing None to re.sub.

    Raises RuntimeError if every attempt comes back empty.
    """
    for _ in range(MAX_EMPTY_RESULTS):
        sentence = model.make_sentence(**kwargs)
        if sentence is not None:
            return re.sub(r'[,;]$', '', sentence)
    raise RuntimeError('the model could not generate a sentence')


def opening_line(model):
    """Generate a line whose last word has at least MIN_RHYMES rhymes.

    Returns a ``(line, syllable_count, rhymes)`` tuple where *rhymes* is a
    set of lower-cased words that rhyme with the line's last word.
    """
    while True:
        line = make_line(model)
        rhymes = pronouncing.rhymes(last_word(line))
        if len(rhymes) >= MIN_RHYMES:
            syllables = textstat.syllable_count(line)
            return line, syllables, {rhyme.lower() for rhyme in rhymes}


def matching_line(model, syllables, rhymes):
    """Generate a line with *syllables* syllables ending in one of *rhymes*.

    Returns the line, or None if MAX_MATCH_TRIES candidates all failed.
    """
    for _ in range(MAX_MATCH_TRIES):
        line = make_line(model, tries=10000)
        ## Compare in lower case: the rhyme set is lower-cased, but a line
        ## may well end in a capitalised word
        if (textstat.syllable_count(line) == syllables
                and last_word(line).lower() in rhymes):
            return line
    return None


def generate_verse(model):
    """Return four lines where 1/3 and 2/4 rhyme and match in syllables."""
    while True:
        first, first_syllables, first_rhymes = opening_line(model)
        second, second_syllables, second_rhymes = opening_line(model)
        ## A verse is accepted only if BOTH searches really succeeded; a
        ## search that ran out of tries restarts with new opening lines.
        third = matching_line(model, first_syllables, first_rhymes)
        if third is None:
            continue
        fourth = matching_line(model, second_syllables, second_rhymes)
        if fourth is None:
            continue
        return first, second, third, fourth


def main():
    """Build the Markov model from corpus.txt and print one verse."""
    os.chdir(PROJECT_DIR)
    with open('corpus.txt') as corpus:
        ## Hand markovify the text itself rather than the file object
        mmodel = markovify.NewlineText(corpus.read())
    for line in generate_verse(mmodel):
        print(line)
    print()


if __name__ == '__main__':
    main()
Loading…
Cancel
Save