import re
import urllib.request
import html
import csv

def get_website_content(url):
    with urllib.request.urlopen(url) as page:   # open URL, closed automatically
        return page.read().decode("utf-8")      # read source code and decode as Unicode text

def get_info_from_website_source(website_text):
    """Extract title, canonical URL and first paragraph from Wikipedia page source."""
    # Best-effort regex extraction; assumes standard Wikipedia HTML markup.
    title_match = re.search(r"<title>(.*?)</title>", website_text, re.DOTALL)
    title = title_match.group(1).strip() if title_match else ""
    url_match = re.search(r'<link rel="canonical" href="(.*?)"', website_text)
    url = url_match.group(1) if url_match else ""
    para_match = re.search(r"<p>(.*?)</p>", website_text, re.DOTALL)
    first_paragraph = re.sub(r"<[^>]+>", "", para_match.group(1)).strip() if para_match else ""
    return [url, title, first_paragraph]

def write_data_to_file(writer, data):
    writer.writerow(data)
    return

def make_corpus():
    with open('corpus.csv', 'w', newline='') as csvfile:
        corpuswriter = csv.writer(csvfile, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        corpuswriter.writerow(["URL", "Title", "First Paragraph"])
        for _ in range(100):
            p = html.unescape(get_website_content("https://en.wikipedia.org/wiki/Special:Random"))
            #print(p)
            data = get_info_from_website_source(p)
            write_data_to_file(corpuswriter, data)
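
# Minimal entry point, added as a usage sketch: build the corpus when the
# script is run directly. Assumes network access to Wikipedia; the sample
# size is hard-coded to 100 inside make_corpus().
if __name__ == "__main__":
    make_corpus()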