You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

51 lines
1.5 KiB

## First, use wget to scrape www.jsward.com for sea shanties
## Then copy all the HTML files to html/
from bs4 import BeautifulSoup
import os
import re

# Patterns matching lines to drop from the extracted text (copyright
# notices and author/source credits). Compiled once at module level
# instead of re-compiling inside the per-file loop.
_DROP_PATTERNS = [
    re.compile(r'Copyright'),                       # copyright notices
    re.compile(r'^By [A-Za-z]+ [A-Za-z]+'),         # "By First Last" credit
    re.compile(r'^Lyrics from'),                    # source credit
    re.compile(r'^Traditional'),                    # "Traditional" attribution
    re.compile(r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}'),  # "First Last, 1998" credit
]

os.chdir('/home/bgcarlisle/Projects/Shantybot/')
files = os.listdir('html')
os.chdir('html')

# Open the corpus once in append mode, rather than re-opening it for
# every input file as the original did.
with open('../corpus.txt', 'a') as outputfile:
    for filename in files:
        with open(filename) as shanty:
            print(filename)
            doc = BeautifulSoup(shanty, 'html.parser')
            # Strip boilerplate: the <head>, and the first <h1>/<h2>
            # page title when present.
            doc.head.decompose()
            if doc.find_all("h1"):
                doc.h1.decompose()
            if doc.find_all("h2"):
                doc.h2.decompose()
            text = doc.body.get_text()
            lines = text.split('\n')
            ## Remove empty lines
            lines = [line for line in lines if line.strip()]
            ## Remove copyright notices and author credits
            lines = [
                line for line in lines
                if not any(p.search(line) for p in _DROP_PATTERNS)
            ]
            for line in lines:
                outputfile.write(line + '\n')

## Now you have all the sea shanty lines in a single corpus.txt file