You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
51 lines
1.5 KiB
51 lines
1.5 KiB
## First, use wget to scrape www.jsward.com for sea shanties

## Then copy all the HTML files to html/
|
|
|
from bs4 import BeautifulSoup |
|
import os |
|
import re |
|
|
|
## Lines matching any of these patterns are credits/attributions rather
## than lyrics, and are left out of the corpus.
## NOTE: the '^'-anchored patterns are tested against each line exactly as
## extracted (not stripped), so a credit line with leading whitespace is
## not caught by them.
CREDIT_PATTERNS = (
    re.compile(r'Copyright'),                       ## Copyright lines
    re.compile(r'^By [A-Za-z]+ [A-Za-z]+'),         ## author credit
    re.compile(r'^Lyrics from'),                    ## author credit
    re.compile(r'^Traditional'),                    ## author credit
    re.compile(r'^[A-Za-z]+ [A-Za-z]+, [0-9]{4}'),  ## author credit
)


def _shanty_lines(markup):
    """Return the lyric lines contained in one scraped HTML page.

    markup is anything BeautifulSoup accepts (str, bytes, or an open
    file).  The <head>, the first <h1> and the first <h2> are dropped when
    present; the remaining text is split into lines, and blank lines and
    lines matching CREDIT_PATTERNS are discarded.  Surviving lines are
    returned unstripped and without their line terminator.
    """
    doc = BeautifulSoup(markup, 'html.parser')

    ## Not every page has all three elements; only remove what exists
    ## instead of failing on a missing tag.
    for tag_name in ('head', 'h1', 'h2'):
        tag = doc.find(tag_name)
        if tag is not None:
            tag.decompose()

    ## html.parser does not invent a <body>, so fall back to the whole
    ## document for pages/fragments that lack one.
    root = doc if doc.body is None else doc.body

    ## splitlines() copes with '\n', '\r\n' and '\r' endings, which are no
    ## longer normalised for us now that the file is read as bytes.
    return [
        line for line in root.get_text().splitlines()
        if line.strip()
        and not any(pattern.search(line) for pattern in CREDIT_PATTERNS)
    ]


os.chdir('/home/bgcarlisle/Projects/Shantybot/')
## Sorted so that the corpus comes out in the same order on every run
files = sorted(os.listdir('html'))
os.chdir('html')

## Append mode is kept from the original script: running it twice adds the
## lines twice, so delete corpus.txt first when rebuilding from scratch.
with open('../corpus.txt', 'a', encoding='utf-8') as outputfile:
    for filename in files:
        ## wget can leave sub-directories behind; open() would fail on them
        if not os.path.isfile(filename):
            continue

        print(filename)

        ## Read raw bytes and let BeautifulSoup work out the encoding, so a
        ## page that is not in the locale's encoding does not abort the run.
        with open(filename, 'rb') as shanty:
            lines = _shanty_lines(shanty)

        outputfile.writelines(line + '\n' for line in lines)
|
|
|
## Now you have all the sea shanty lines in a single corpus.txt file
|
|
|