2 changed files with 156 additions and 0 deletions
@ -0,0 +1,51 @@
|
||||
## First, use wget to scrape www.jsward.com for sea shanties |
||||
## Then copy all the HTML files to html/
||||
|
||||
from bs4 import BeautifulSoup |
||||
import os |
||||
import re |
||||
|
||||
PROJECT_DIR = '/home/bgcarlisle/Projects/Shantybot/'

## Lines matching any of these patterns are page boilerplate (copyright
## notices and author/source credits) rather than lyrics.  Compiled once
## here instead of once per line.
NON_LYRIC_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'Copyright',
    r'^By [A-Za-z]+ [A-Za-z]+',
    r'^Lyrics from',
    r'^Traditional',
    r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}',
))


def filter_lyric_lines(text):
    """Return the lines of *text* that look like lyrics.

    Blank (whitespace-only) lines are dropped, as are lines matching any
    entry in NON_LYRIC_PATTERNS.  Kept lines are returned unmodified (they
    are not stripped), in their original order.
    """
    kept = []
    for line in text.split('\n'):
        if not line.strip():
            continue
        if any(pattern.search(line) for pattern in NON_LYRIC_PATTERNS):
            continue
        kept.append(line)
    return kept


def extract_lyric_lines(markup):
    """Parse one HTML page and return its lyric lines.

    *markup* is anything BeautifulSoup accepts (a string or an open file).
    The <head> and the first <h1> and <h2> are removed before the text is
    extracted, then the text is passed through filter_lyric_lines().
    """
    doc = BeautifulSoup(markup, 'html.parser')

    ## find() returns None when the tag is absent, so pages without a
    ## <head>, <h1> or <h2> are handled instead of raising AttributeError.
    for tag_name in ('head', 'h1', 'h2'):
        tag = doc.find(tag_name)
        if tag is not None:
            tag.decompose()

    ## html.parser does not invent a <body>; fall back to the whole document.
    root = doc.body if doc.body is not None else doc
    return filter_lyric_lines(root.get_text())


def main():
    """Append the lyric lines of every file in html/ to corpus.txt."""
    os.chdir(PROJECT_DIR)

    ## Open the corpus once (still in append mode) rather than once per page.
    with open('corpus.txt', 'a') as outputfile:
        ## sorted() makes the corpus order reproducible between runs.
        for filename in sorted(os.listdir('html')):
            path = os.path.join('html', filename)
            if not os.path.isfile(path):
                ## Skip subdirectories a recursive wget may have created.
                continue
            print(filename)
            with open(path) as shanty:
                lines = extract_lyric_lines(shanty)
            outputfile.writelines(line + '\n' for line in lines)


if __name__ == '__main__':
    main()

## Now you have all the sea shanty lines in a single corpus.txt file
||||
@ -0,0 +1,105 @@
|
||||
import markovify |
||||
import os |
||||
import re |
||||
import textstat |
||||
import pronouncing |
||||
|
||||
PROJECT_DIR = '/home/bgcarlisle/Projects/Shantybot/'

## An opening line is only accepted if its last word has at least this many
## known rhymes, so that a matching line is reasonably likely to exist.
MIN_RHYMES = 6

## How many candidate lines to try per rhyming partner before giving up and
## starting the stanza over with new opening lines.
MAX_MATCH_TRIES = 1000

## How many times to re-ask the model when it fails to produce a sentence.
MAX_SENTENCE_ATTEMPTS = 100


def strip_trailing_punctuation(line):
    """Return *line* without a single trailing comma or semicolon."""
    return re.sub(r'[,;]$', '', line)


def last_word(line):
    """Return the last space-separated word of *line*, letters only."""
    return re.sub(r'[^A-Za-z]', '', line.split(" ")[-1])


def make_line(model, **kwargs):
    """Generate one line from *model*, retrying when it yields nothing.

    make_sentence() may return None when it cannot build a sentence; passing
    that None straight to re.sub() raises TypeError, so retry instead.
    Extra keyword arguments are forwarded to make_sentence().

    Raises RuntimeError if no sentence is produced after
    MAX_SENTENCE_ATTEMPTS calls.
    """
    for _ in range(MAX_SENTENCE_ATTEMPTS):
        sentence = model.make_sentence(**kwargs)
        if sentence is not None:
            return strip_trailing_punctuation(sentence)
    raise RuntimeError('The Markov model could not generate a sentence')


def pick_opening_line(model):
    """Generate a line whose last word has at least MIN_RHYMES rhymes.

    Returns a (line, syllable_count, rhymes) tuple, where rhymes is the set
    of words reported as rhyming with the line's last word.
    """
    while True:
        line = make_line(model)
        rhymes = pronouncing.rhymes(last_word(line))
        if len(rhymes) >= MIN_RHYMES:
            return line, textstat.syllable_count(line), set(rhymes)


def pick_matching_line(model, syllables, rhymes):
    """Generate a line with *syllables* syllables ending in one of *rhymes*.

    Returns the line, or None if none was found in MAX_MATCH_TRIES attempts.
    Both conditions must hold, so a line that rhymes but has the wrong
    syllable count is never accepted just because the attempts ran out.
    """
    for _ in range(MAX_MATCH_TRIES):
        line = make_line(model, tries=10000)
        if textstat.syllable_count(line) != syllables:
            continue
        ## Lower-cased because the rhyme list appears to be lowercase words,
        ## so a capitalised last word would otherwise never match.
        if last_word(line).lower() in rhymes:
            return line
    return None


def main():
    """Print a four-line stanza: lines 1/3 and 2/4 rhyme and scan alike."""
    os.chdir(PROJECT_DIR)

    with open ('corpus.txt') as corpus:
        mmodel = markovify.NewlineText(corpus)

    while True:
        first_line, first_syllables, first_rhymes = pick_opening_line(mmodel)
        second_line, second_syllables, second_rhymes = pick_opening_line(mmodel)

        third_line = pick_matching_line(mmodel, first_syllables, first_rhymes)
        if third_line is None:
            ## No partner for the first line; start over with new openers.
            continue

        fourth_line = pick_matching_line(mmodel, second_syllables, second_rhymes)
        if fourth_line is None:
            continue

        break

    print(first_line)
    print(second_line)
    print(third_line)
    print(fourth_line)
    print()


if __name__ == '__main__':
    main()
||||
Loading…
Reference in new issue