import re
import urllib.request
import html
import csv
def get_website_content(url):
    """Fetch *url* over HTTP(S) and return the response body as UTF-8 text.

    Parameters:
        url: address to fetch.

    Returns:
        The decoded page source as a str.

    The original version never closed the HTTP response; the context
    manager guarantees the connection is released even if reading or
    decoding raises.
    """
    with urllib.request.urlopen(url) as page:  # closed on exit, even on error
        return page.read().decode("utf-8")
def get_info_from_website_source(website_text):
    """Extract canonical URL, title and first paragraph from HTML source.

    Implements the former TODO stub. When a field cannot be found, the
    original placeholder values ("url", "title", "text") are returned so
    existing consumers of the CSV see no new value shapes.

    Parameters:
        website_text: full HTML source of the page as a decoded str.

    Returns:
        A list [url, title, first_paragraph].
    """
    # Page title from the <title> element.
    title = "title"
    m = re.search(r"<title[^>]*>(.*?)</title>", website_text,
                  re.IGNORECASE | re.DOTALL)
    if m:
        title = m.group(1).strip()

    # Canonical URL advertised by the page.
    # NOTE(review): assumes rel="canonical" appears before href, which is
    # how Wikipedia emits the tag — confirm if reused on other sites.
    url = "url"
    m = re.search(r'<link[^>]+rel="canonical"[^>]+href="([^"]+)"',
                  website_text, re.IGNORECASE)
    if m:
        url = m.group(1)

    # First paragraph that is non-empty after stripping nested markup.
    first_paragraph = "text"
    for m in re.finditer(r"<p[^>]*>(.*?)</p>", website_text,
                         re.IGNORECASE | re.DOTALL):
        text = re.sub(r"<[^>]+>", "", m.group(1)).strip()
        if text:
            first_paragraph = text
            break

    return [url, title, first_paragraph]
def write_data_to_file(writer, data):
    """Append one row (*data*) to the corpus via the given csv writer."""
    writer.writerow(data)
def make_corpus(n_pages=100):
    """Build corpus.csv from *n_pages* random Wikipedia articles.

    Writes a header row, then one [URL, Title, First Paragraph] row per
    fetched page, semicolon-delimited.

    Parameters:
        n_pages: number of random articles to fetch. Defaults to 100,
            preserving the original hard-coded behavior.

    Fixes: the file is now opened with an explicit UTF-8 encoding —
    Wikipedia titles are frequently non-ASCII and would crash the write
    on platforms whose default locale encoding is not UTF-8.
    """
    with open('corpus.csv', 'w', newline='', encoding='utf-8') as csvfile:
        corpuswriter = csv.writer(csvfile, delimiter=';', quotechar='|',
                                  quoting=csv.QUOTE_MINIMAL)
        corpuswriter.writerow(["URL", "Title", "First Paragraph"])
        for _ in range(n_pages):
            # Each request to Special:Random resolves to a fresh random article.
            source = html.unescape(
                get_website_content("https://en.wikipedia.org/wiki/Special:Random"))
            data = get_info_from_website_source(source)
            write_data_to_file(corpuswriter, data)