Commit 8b479574 authored by Vahurzpu

Initial commit

- Retrieve data
  - Draft component
    - Accepted drafts fished out of the page history of Wikipedia:Articles for creation/recent
    - Rejected drafts fished out of the rejected drafts category
    - Look through page history to keep track of resubmits (sketched after the retrieval script below)
  - Promotional component
    - Look through recent changes to detect pages entering/leaving Category:All articles with a promotional tone
    - Create a cleanly paired dataset of promotional/non-promotional revisions from that (with some filtering to detect multiple-revision changes)
  - Reference quality component
    - Create a JSON version of the table at https://en.wikipedia.org/wiki/User:Vahurzpu/Source_reliability_information
- Baseline calculations
  - Use ORES to get draftquality scores of the various revisions (a fetching sketch follows this list)
  - Train a very simple model to get P(accept) from those scores
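The script that produces ores-scores.json is not part of this commit, so the following is only a sketch of how the scores might be fetched. It assumes the public ORES v3 REST API; note that although the plan says "draftquality", the baseline script below consumes class-probability columns (Stub through FA) that belong to ORES's articlequality model, so the sketch queries that model. The revision ID and the relative output path are placeholders.

import json
import requests

USER_AGENT = "DraftAcceptPrediction/0.0 (by User:Vahurzpu; mmwootten@outlook.com)"

def fetch_quality_score(revid: int) -> dict:
    # Ask ORES to score one revision with the articlequality model, whose
    # classes (Stub through FA) match the feature columns used by the
    # baseline script below.
    response = requests.get(
        f'https://ores.wikimedia.org/v3/scores/enwiki/{revid}/articlequality',
        headers={'User-Agent': USER_AGENT},
    )
    response.raise_for_status()
    result = response.json()['enwiki']['scores'][str(revid)]['articlequality']
    # ORES reports per-revision failures inside the response body rather
    # than via the HTTP status; record them so they can be filtered later.
    if 'error' in result:
        return {'revid': revid, 'error': result['error']['message']}
    return {'revid': revid, 'error': None, **result['score']['probability']}

# Placeholder revision ID; append one JSON object per line, which is the
# layout the baseline script reads back from ores-scores.json.
some_revid = 123456789
with open('../../data/interim/ores-scores.json', 'a') as f:
    f.write(json.dumps(fetch_quality_score(some_revid)) + '\n')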
from typing import List
from pathlib import Path
from xml.etree import ElementTree as ET

import requests
import pywikibot
from tqdm import tqdm

USER_AGENT = "DraftAcceptPrediction/0.0 (by User:Vahurzpu; mmwootten@outlook.com)"

enwiki = pywikibot.Site('en', 'wikipedia')

# Enumerate declined AfC submissions, skipping subcategory pages (namespace 14).
declined_submissions = pywikibot.Category(enwiki, 'Category:Declined AfC submissions').articles()
declined_submissions: List[pywikibot.Page] = [
    page for page in declined_submissions if page.namespace() != enwiki.namespace(14)
]

for submission in tqdm(declined_submissions):
    page_xml_file = Path(f'../../data/raw/declined-drafts/{submission.pageid}.xml')
    if page_xml_file.exists():
        continue
    try:
        # Fetch the draft's full revision history as MediaWiki export XML.
        exported_xml = requests.get('https://en.wikipedia.org/wiki/Special:Export', params={
            'pages': submission.title(),
            'history': 1
        }, headers={
            'User-Agent': USER_AGENT
        }).text
        # Sanity check: the exported page must be the one that was requested.
        ns = {'mw': 'http://www.mediawiki.org/xml/export-0.10/'}
        page_id = int(ET.fromstring(exported_xml).find('mw:page', ns).find('mw:id', ns).text)
        assert page_id == submission.pageid
        with open(page_xml_file, 'w') as f:
            f.write(exported_xml)
    except Exception as e:
        print(submission.pageid, submission.title())
        print(e)
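The "keep track of resubmits" item in the plan can work directly off these saved exports, since Special:Export with history=1 captures every revision. A minimal sketch follows, assuming the AfC helper script's convention of edit summaries that begin with "Declining submission" (that prefix is an assumption about AFCH behaviour, not something this commit verifies):

from pathlib import Path
from xml.etree import ElementTree as ET

ns = {'mw': 'http://www.mediawiki.org/xml/export-0.10/'}

def count_declines(xml_path: Path) -> int:
    # Count revisions whose edit summary looks like an AfC decline; a draft
    # declined n times was submitted roughly n times.
    page = ET.parse(xml_path).getroot().find('mw:page', ns)
    declines = 0
    for revision in page.findall('mw:revision', ns):
        comment = revision.find('mw:comment', ns)
        if comment is not None and (comment.text or '').startswith('Declining submission'):
            declines += 1
    return declines

for xml_file in sorted(Path('../../data/raw/declined-drafts').glob('*.xml')):
    print(xml_file.stem, count_declines(xml_file))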
import json

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load ORES scores, one JSON object per line.
ores_scores = []
with open('../../../data/interim/ores-scores.json', 'r') as f:
    for line in f:
        ores_scores.append(json.loads(line))

df_ores_scores = pd.DataFrame(ores_scores)
# Keep only revisions that ORES scored successfully.
df_ores_scores = df_ores_scores[df_ores_scores.error.isnull()]
df_ores_scores = df_ores_scores.drop('error', axis=1)

# Join the human accept/decline judgements to the ORES scores by revision ID.
df_judgements = pd.read_csv('../../../data/interim/annotated-judgements.csv').drop('Unnamed: 0', axis=1)
df_merged = pd.merge(df_judgements, df_ores_scores, on='revid')

# Features: ORES quality-class probabilities; labels: the AfC judgements.
Xs = np.array(df_merged[['Stub', 'Start', 'C', 'B', 'GA', 'FA']])
Ys = np.array(df_merged['judgement'])

Xs_train, Xs_test, Ys_train, Ys_test = train_test_split(Xs, Ys, test_size=0.2)
model = LogisticRegression().fit(Xs_train, Ys_train)
Ys_pred = model.predict(Xs_test)
print(accuracy_score(Ys_test, Ys_pred))
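The plan's end goal is P(accept) rather than a hard prediction, and the fitted model already exposes that through predict_proba. Which column corresponds to "accept" depends on how the judgement column is encoded, which this commit does not show:

# Class ordering for predict_proba's columns; which entry means "accept"
# depends on the encoding of the judgement column.
print(model.classes_)
# Per-revision acceptance probabilities for the held-out set.
print(model.predict_proba(Xs_test)[:5])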