2 changed files with 156 additions and 0 deletions
@ -0,0 +1,51 @@ |
|||||||
|
## First, use wget to scrape www.jsward.com for sea shanties |
||||||
|
## Then copy all the HTML files to html/ |
||||||
|
|
||||||
|
from bs4 import BeautifulSoup |
||||||
|
import os |
||||||
|
import re |
||||||
|
|
||||||
|
## Paths: HTML_DIR and CORPUS_FILE are relative to PROJECT_DIR, which
## main() makes the working directory
PROJECT_DIR = '/home/bgcarlisle/Projects/Shantybot/'
HTML_DIR = 'html'
CORPUS_FILE = 'corpus.txt'

## A line matching any of these is dropped from the corpus: copyright
## notices and the various author/source credit formats.
## Compiled once here rather than once per line.
NON_LYRIC_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'Copyright',
    r'^By [A-Za-z]+ [A-Za-z]+',
    r'^Lyrics from',
    r'^Traditional',
    r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}',
))


def filter_lyric_lines(lines):
    """Return the entries of *lines* that should go into the corpus.

    Blank / whitespace-only lines are removed, as is every line that
    matches one of NON_LYRIC_PATTERNS. The surviving lines are returned
    unmodified and in their original order.
    """
    return [
        line for line in lines
        if line.strip()
        and not any(pattern.search(line) for pattern in NON_LYRIC_PATTERNS)
    ]


def extract_lyric_lines(markup):
    """Parse one HTML document and return its filtered text lines.

    *markup* is handed straight to BeautifulSoup (a string, bytes or an
    open file handle). The <head> and the first <h1> and <h2> are removed
    before the text is extracted.
    """
    doc = BeautifulSoup(markup, 'html.parser')

    ## Not every page has all of these tags, so only remove what exists
    for tag_name in ('head', 'h1', 'h2'):
        tag = doc.find(tag_name)
        if tag is not None:
            tag.decompose()

    ## Fall back to the whole document when there is no <body> element
    root = doc.body if doc.body is not None else doc

    return filter_lyric_lines(root.get_text().split('\n'))


def main():
    """Append the lyric lines of every file in HTML_DIR to CORPUS_FILE."""
    os.chdir(PROJECT_DIR)

    filenames = os.listdir(HTML_DIR)

    ## Append mode: lines are added to whatever corpus.txt already holds
    with open(CORPUS_FILE, 'a') as outputfile:
        for filename in filenames:
            path = os.path.join(HTML_DIR, filename)

            ## Skip sub-directories and anything else that is not a file
            if not os.path.isfile(path):
                continue

            print(filename)

            ## Binary mode lets BeautifulSoup work out the page encoding
            ## itself instead of assuming the locale's default
            with open(path, 'rb') as shanty:
                lines = extract_lyric_lines(shanty)

            outputfile.writelines(line + '\n' for line in lines)


if __name__ == '__main__':
    main()

## Now you have all the sea shanty lines in a single corpus.txt file
||||||
@ -0,0 +1,105 @@ |
|||||||
|
import markovify |
||||||
|
import os |
||||||
|
import re |
||||||
|
import textstat |
||||||
|
import pronouncing |
||||||
|
|
||||||
|
PROJECT_DIR = '/home/bgcarlisle/Projects/Shantybot/'
CORPUS_FILE = 'corpus.txt'

## A seed line is only accepted when its last word has at least this
## many known rhymes, so that a partner line can realistically be found
MIN_RHYMES = 6

## How many candidate lines to try when looking for a rhyming partner
## before giving up and starting over with fresh seed lines
MAX_MATCH_TRIES = 1000

## Passed to make_sentence(tries=...) when generating partner lines
SENTENCE_TRIES = 10000

TRAILING_PUNCTUATION = re.compile(r'[,;]$')
NON_LETTERS = re.compile(r'[^A-Za-z]')


def clean_line(sentence):
    """Return *sentence* without a trailing comma or semicolon."""
    return TRAILING_PUNCTUATION.sub('', sentence)


def last_word(line):
    """Return the final space-separated word of *line*, letters only.

    The result is lower-cased so that it can be compared with the words
    returned by pronouncing.rhymes()
    (NOTE(review): assumes those are always lower-case -- confirm).
    """
    return NON_LETTERS.sub('', line.split(" ")[-1]).lower()


def pick_seed_line(model):
    """Generate a line whose last word has at least MIN_RHYMES rhymes.

    Returns a (line, rhymes) tuple. Keeps generating until a suitable
    line turns up; a failed generation (make_sentence() returning None)
    simply counts as a miss.
    """
    while True:
        sentence = model.make_sentence()
        if sentence is None:
            continue

        line = clean_line(sentence)
        rhymes = pronouncing.rhymes(last_word(line))

        if len(rhymes) >= MIN_RHYMES:
            return line, rhymes


def find_matching_line(model, target_syllables, rhymes, max_tries=MAX_MATCH_TRIES):
    """Generate a line that has *target_syllables* syllables and ends in
    one of *rhymes*.

    Returns the line, or None when no candidate satisfied BOTH conditions
    within *max_tries* attempts.
    """
    rhyme_set = set(rhymes)

    for _ in range(max_tries):
        sentence = model.make_sentence(tries=SENTENCE_TRIES)

        ## make_sentence() gives None when it could not build a sentence
        if sentence is None:
            continue

        line = clean_line(sentence)

        if last_word(line) not in rhyme_set:
            continue

        if textstat.syllable_count(line) == target_syllables:
            return line

    return None


def make_verse(model):
    """Return four lines in which 1/3 and 2/4 rhyme and match in syllables.

    Picks two seed lines, then looks for a partner for each. If either
    partner cannot be found, everything is thrown away and the search
    starts again with new seed lines.
    """
    while True:
        first_line, first_rhymes = pick_seed_line(model)
        second_line, second_rhymes = pick_seed_line(model)

        third_line = find_matching_line(
            model, textstat.syllable_count(first_line), first_rhymes)
        if third_line is None:
            continue

        fourth_line = find_matching_line(
            model, textstat.syllable_count(second_line), second_rhymes)
        if fourth_line is None:
            continue

        return first_line, second_line, third_line, fourth_line


def main():
    """Build the Markov model from CORPUS_FILE and print one verse."""
    os.chdir(PROJECT_DIR)

    with open(CORPUS_FILE) as corpus:
        mmodel = markovify.NewlineText(corpus.read())

    for line in make_verse(mmodel):
        print(line)

    print()


if __name__ == '__main__':
    main()
||||||
Loading…
Reference in new issue