import re
import urllib.request
import html
import csv


def get_website_content(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    The original left the HTTP response object unclosed; the context
    manager guarantees the connection is released even if read/decode fails.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")

def get_info_from_website_source(website_text):
    """Extract [url, title, first paragraph] from a Wikipedia page's HTML.

    The page is assumed to contain a <title> element, a canonical-link tag,
    and at least one <p> (callers check for "<p>" before calling).
    Returns a 3-element list: [canonical URL, article title, plain-text
    first paragraph with tags and reference markers removed].
    """
    # Take everything inside <title>…</title>, then strip the site suffix.
    # Splitting on "</title>" first (instead of a hard-coded suffix) works
    # for both the old " - Wikipedia, the free encyclopedia" and the
    # current " - Wikipedia" title formats.
    title = website_text.split("<title>")[1].split("</title>")[0]
    title = re.sub(r" - Wikipedia.*$", "", title)
    print("Title:\t", title)

    u = re.search(r'<link rel="canonical" href="(https://en\.wikipedia\.org/wiki/.*)" />', website_text)
    url = u.group(1)

    first_paragraph = get_first_paragraph(website_text)
    # Remove all HTML tags, then citation markers like [3] or [12]
    # (the original pattern \[[0-9]\] missed multi-digit references).
    first_paragraph = re.sub(r"<[^>]*>", "", first_paragraph)
    first_paragraph = re.sub(r"\[[0-9]+\]", "", first_paragraph)
    return [url, title, first_paragraph]

def get_first_paragraph(text):
    """Return the contents of the first <p>…</p> that is not inside a table.

    Any <table>…</table> markup appearing before the first paragraph is
    discarded (nested and still-open tables included) before extracting.
    """
    while True:
        before_first_p = text.split("<p>")[0]
        if "<table" not in before_first_p:
            # No table precedes the paragraph: extract and return it.
            return text.split("<p>")[1].split("</p>")[0]
        if "</table>" not in before_first_p:
            # A table opens before the <p> but closes after it: the <p> is
            # inside the table, so drop everything up to the table's end.
            text = text.split("</table>", maxsplit=1)[1]
        else:
            # One or more complete tables precede the <p>: skip past as many
            # closing tags as there are opening ones, keep the remainder.
            opening_tags = before_first_p.count("<table")
            text = text.split("</table>", maxsplit=opening_tags)[-1]

def write_data_to_file(writer, data):
    """Write one record (*data*, a sequence of fields) as a row via *writer*."""
    writer.writerow(data)

def make_corpus():
    """Build corpus.csv from 100 random English Wikipedia articles.

    Each row is tab-separated: URL, title, first paragraph. Pages without
    any <p> element (so no extractable paragraph) are skipped, which means
    fewer than 100 rows may be written.
    """
    # newline="" is required by the csv module so it controls line endings
    # itself (otherwise rows get doubled line breaks on Windows); the
    # `with` block guarantees the file is closed even if a fetch fails.
    with open("corpus.csv", "w", encoding="utf8", newline="") as csvfile:
        corpuswriter = csv.writer(csvfile, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        corpuswriter.writerow(["URL", "Title", "First Paragraph"])
        for _ in range(100):
            page = html.unescape(get_website_content("https://en.wikipedia.org/wiki/Special:Random"))
            # Only pages containing at least one paragraph are usable.
            if "<p>" in page:
                data = get_info_from_website_source(page)
                write_data_to_file(corpuswriter, data)
