import re
import urllib.request
import html
import csv


def get_website_content(url):
    """Download *url* and return the response body decoded as UTF-8 text.

    The original left the HTTP response object unclosed; the context
    manager guarantees the connection is released even if read/decode fails.
    """
    with urllib.request.urlopen(url) as response:
        return response.read().decode("utf-8")

def get_info_from_website_source(website_text):
    """Extract [url, title, first paragraph] from a Wikipedia page's HTML.

    The page is assumed to contain a <title> element, a canonical-link tag,
    and at least one <p> (callers check for "<p>" before calling).
    Returns a 3-element list: [canonical URL, article title, plain-text
    first paragraph with tags and reference markers removed].
    """
    # Take everything inside <title>…</title>, then strip the site suffix.
    # Splitting on "</title>" first (instead of a hard-coded suffix) works
    # for both the old " - Wikipedia, the free encyclopedia" and the
    # current " - Wikipedia" title formats.
    title = website_text.split("<title>")[1].split("</title>")[0]
    title = re.sub(r" - Wikipedia.*$", "", title)
    print("Title:\t", title)

    u = re.search(r'<link rel="canonical" href="(https://en\.wikipedia\.org/wiki/.*)" />', website_text)
    url = u.group(1)

    first_paragraph = get_first_paragraph(website_text)
    # Remove all HTML tags, then citation markers like [3] or [12]
    # (the original pattern \[[0-9]\] missed multi-digit references).
    first_paragraph = re.sub(r"<[^>]*>", "", first_paragraph)
    first_paragraph = re.sub(r"\[[0-9]+\]", "", first_paragraph)
    return [url, title, first_paragraph]

def get_first_paragraph(text):
    """Return the contents of the first <p>…</p> that is not inside a table.

    Any <table>…</table> markup appearing before the first paragraph is
    discarded (nested and still-open tables included) before extracting.
    """
    while True:
        before_first_p = text.split("<p>")[0]
        if "<table" not in before_first_p:
            # No table precedes the paragraph: extract and return it.
            return text.split("<p>")[1].split("</p>")[0]
        if "</table>" not in before_first_p:
            # A table opens before the <p> but closes after it: the <p> is
            # inside the table, so drop everything up to the table's end.
            text = text.split("</table>", maxsplit=1)[1]
        else:
            # One or more complete tables precede the <p>: skip past as many
            # closing tags as there are opening ones, keep the remainder.
            opening_tags = before_first_p.count("<table")
            text = text.split("</table>", maxsplit=opening_tags)[-1]

def write_data_to_file(writer, data):
    """Write one record (*data*, a sequence of fields) as a row via *writer*."""
    writer.writerow(data)

def make_corpus():
    """Build corpus.csv from 100 random English Wikipedia articles.

    Each row is tab-separated: URL, title, first paragraph. Pages without
    any <p> element (so no extractable paragraph) are skipped, which means
    fewer than 100 rows may be written.
    """
    # newline="" is required by the csv module so it controls line endings
    # itself (otherwise rows get doubled line breaks on Windows); the
    # `with` block guarantees the file is closed even if a fetch fails.
    with open("corpus.csv", "w", encoding="utf8", newline="") as csvfile:
        corpuswriter = csv.writer(csvfile, delimiter='\t', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        corpuswriter.writerow(["URL", "Title", "First Paragraph"])
        for _ in range(100):
            page = html.unescape(get_website_content("https://en.wikipedia.org/wiki/Special:Random"))
            # Only pages containing at least one paragraph are usable.
            if "<p>" in page:
                data = get_info_from_website_source(page)
                write_data_to_file(corpuswriter, data)
