From 2dd06bf461529ec2cbe4b6389dfcc70c8380ccb9 Mon Sep 17 00:00:00 2001
From: Benjamin G Carlisle <murph@bgcarlisle.com>
Date: Thu, 4 Feb 2021 13:50:04 +0100
Subject: [PATCH] Scripts for extracting lines and generating shanties

---
 lyric-extract.py |  51 +++++++++++++++++++++++
 shantybot.py     | 105 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 156 insertions(+)
 create mode 100644 lyric-extract.py
 create mode 100644 shantybot.py

diff --git a/lyric-extract.py b/lyric-extract.py
new file mode 100644
index 0000000..2830611
--- /dev/null
+++ b/lyric-extract.py
@@ -0,0 +1,51 @@
+## First, use wget to scrape www.jsward.com for sea shanties
+## Then copy all the HTML files to html/
+
+from bs4 import BeautifulSoup
+import os
+import re
+
+os.chdir('/home/bgcarlisle/Projects/Shantybot/')
+files = os.listdir('html')
+os.chdir('html')
+
+## Copyright and author-credit lines to leave out of the corpus.
+## Compiled once here rather than once per line inside the loop.
+NON_LYRIC_PATTERNS = [
+    re.compile(r'Copyright'),
+    re.compile(r'^By [A-Za-z]+ [A-Za-z]+'),
+    re.compile(r'^Lyrics from'),
+    re.compile(r'^Traditional'),
+    re.compile(r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}'),
+]
+
+## Open the corpus once, in append mode, so existing content is kept.
+with open('../corpus.txt', 'a') as outputfile:
+    for filename in files:
+        ## Skip anything that is not a regular file (e.g. a directory).
+        if not os.path.isfile(filename):
+            continue
+        with open(filename) as shanty:
+            print(filename)
+            doc = BeautifulSoup(shanty, 'html.parser')
+
+        ## Drop <head> and the first <h1> and <h2>.  Any of them may be
+        ## missing, so check for None instead of crashing on the page.
+        for tag_name in ('head', 'h1', 'h2'):
+            tag = doc.find(tag_name)
+            if tag is not None:
+                tag.decompose()
+
+        ## A page without a <body> has no text to extract.
+        if doc.body is None:
+            continue
+
+        for line in doc.body.get_text().split('\n'):
+            ## Keep non-empty lines that match none of the patterns.
+            if not line.strip():
+                continue
+            if any(p.search(line) for p in NON_LYRIC_PATTERNS):
+                continue
+            outputfile.write(line + '\n')
+
+## Now you have all the sea shanty lines in a single corpus.txt file
diff --git a/shantybot.py b/shantybot.py
new file mode 100644
index 0000000..6ac0a77
--- /dev/null
+++ b/shantybot.py
@@ -0,0 +1,105 @@
+## Generate a four-line sea shanty verse from corpus.txt using a Markov
+## chain, with lines 1/3 and 2/4 matching in rhyme and syllable count.
+
+import markovify
+import os
+import re
+import textstat
+import pronouncing
+
+## corpus.txt is expected in the project directory (see lyric-extract.py).
+os.chdir('/home/bgcarlisle/Projects/Shantybot/')
+
+## Each line of the corpus is treated as one sentence by NewlineText.
+with open('corpus.txt') as corpus:
+    mmodel = markovify.NewlineText(corpus)
+
+## A line is only usable as a rhyme target if its last word has at least
+## this many known rhymes.
+MIN_RHYMES = 6
+
+## Attempts at finding a matching line before starting the verse over.
+MAX_TRIES = 1000
+
+
+def make_line(**kwargs):
+    """Return one generated line without a trailing ',' or ';'.
+
+    Keyword arguments are passed on to make_sentence().  Returns None
+    when the model fails to produce a sentence, because make_sentence()
+    itself returns None in that case.
+    """
+    sentence = mmodel.make_sentence(**kwargs)
+    if sentence is None:
+        return None
+    return re.sub(r'[,;]$', '', sentence)
+
+
+def last_word(line):
+    """Return the final word of line, letters only, in lower case.
+
+    Lower case is used so the word compares equal to entries returned
+    by pronouncing.rhymes() even when the line capitalises it.
+    """
+    return re.sub(r'[^A-Za-z]', '', line.split(" ")[-1]).lower()
+
+
+def opening_line():
+    """Generate a line whose last word has at least MIN_RHYMES rhymes.
+
+    Returns a (line, syllable count, list of rhymes) tuple.  Loops until
+    a suitable line turns up.
+    """
+    while True:
+        line = make_line()
+        ## Generation can fail; just try again.
+        if line is None:
+            continue
+        rhymes = pronouncing.rhymes(last_word(line))
+        if len(rhymes) >= MIN_RHYMES:
+            return line, textstat.syllable_count(line), rhymes
+
+
+def answering_line(syllables, rhymes):
+    """Find a line with the given syllable count ending in one of rhymes.
+
+    Returns the line, or None if MAX_TRIES attempts all fail.  A line is
+    accepted only when BOTH conditions hold, so a rhyme with the wrong
+    syllable count on the final attempt is never let through.
+    """
+    for _ in range(MAX_TRIES):
+        line = make_line(tries=10000)
+        ## A failed generation still uses up one attempt.
+        if line is None:
+            continue
+        ## Syllable counts must match exactly.
+        if textstat.syllable_count(line) != syllables:
+            continue
+        ## The last word must be one of the rhymes of the opening line.
+        if last_word(line) in rhymes:
+            return line
+    return None
+
+
+## Rhyme scheme is ABAB: line 3 answers line 1 and line 4 answers line 2.
+## If either answering line cannot be found, start over with new openers.
+while True:
+    ## Each opener carries its syllable count and rhyme list.
+    first_line, first_syllables, first_rhymes = opening_line()
+    second_line, second_syllables, second_rhymes = opening_line()
+
+    third_line = answering_line(first_syllables, first_rhymes)
+    if third_line is None:
+        continue
+
+    ## Only look for line 4 once line 3 has been found.
+    fourth_line = answering_line(second_syllables, second_rhymes)
+    if fourth_line is not None:
+        break
+
+## Print the finished verse followed by a blank line.
+print(first_line)
+print(second_line)
+print(third_line)
+print(fourth_line)
+print()