You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

51 lines
1.5 KiB

## First, use wget to scrape www.jsward.com for sea shanties
## Then copy all the HTML files to html/
from bs4 import BeautifulSoup
import os
import re

# Patterns matching lines to drop from the extracted text (copyright
# notices and author/source credits). Compiled once at module level
# instead of re-compiling inside the per-file loop.
_DROP_PATTERNS = [
    re.compile(r'Copyright'),                       # copyright notices
    re.compile(r'^By [A-Za-z]+ [A-Za-z]+'),         # "By First Last" credit
    re.compile(r'^Lyrics from'),                    # source credit
    re.compile(r'^Traditional'),                    # "Traditional" attribution
    re.compile(r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}'),  # "First Last, 1998" credit
]

os.chdir('/home/bgcarlisle/Projects/Shantybot/')
files = os.listdir('html')
os.chdir('html')

# Open the corpus once in append mode, rather than re-opening it for
# every input file as the original did.
with open('../corpus.txt', 'a') as outputfile:
    for filename in files:
        with open(filename) as shanty:
            print(filename)
            doc = BeautifulSoup(shanty, 'html.parser')
            # Strip boilerplate: the <head>, and the first <h1>/<h2>
            # page title when present.
            doc.head.decompose()
            if doc.find_all("h1"):
                doc.h1.decompose()
            if doc.find_all("h2"):
                doc.h2.decompose()
            text = doc.body.get_text()
            lines = text.split('\n')
            ## Remove empty lines
            lines = [line for line in lines if line.strip()]
            ## Remove copyright notices and author credits
            lines = [
                line for line in lines
                if not any(p.search(line) for p in _DROP_PATTERNS)
            ]
            for line in lines:
                outputfile.write(line + '\n')

## Now you have all the sea shanty lines in a single corpus.txt file