Commit 7744212b authored by Vahurzpu

Add some previously missing files

parent dbad39d7
import json
import random

# Convert the scraped promo/non-promo revision pairs into fastText-format
# lines ("__label__<class> <text>") and split them 70/20/10 into
# train/test/dev sets.
with open('../../data/raw/promo-tagged-pairs.json', 'r') as f:
    records = json.load(f)

lines = []
for record in records:
    line = '__label__'
    line += 'promotional ' if record['tagged_promo'] else 'nonpromotional '
    line += record['clean_contents'].replace('\n', ' ')
    lines.append(line)

random.shuffle(lines)
cut1 = int(0.7 * len(lines))
cut2 = int(0.9 * len(lines))

def write_lines(name, split):
    with open(f'../../data/interim/promo-tags/{name}.txt', 'w') as f:
        f.write('\n'.join(split))

write_lines('train', lines[:cut1])
write_lines('test', lines[cut1:cut2])
write_lines('dev', lines[cut2:])
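
# Illustrative follow-up (not part of this commit): train and evaluate a
# fastText classifier on the splits written above, assuming the `fasttext`
# Python package is installed.
import fasttext

model = fasttext.train_supervised(input='../../data/interim/promo-tags/train.txt')
n, precision, recall = model.test('../../data/interim/promo-tags/dev.txt')
print(f'{n} dev examples, P@1={precision:.3f}, R@1={recall:.3f}')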
import json
import pandas as pd

# Load the line-delimited ORES scores.
ores_scores = []
with open('../../data/interim/ores-scores.json', 'r') as f:
    for line in f:
        ores_scores.append(json.loads(line))

# Keep only revisions that were scored without error.
df_ores_scores = pd.DataFrame(ores_scores)
df_ores_scores = df_ores_scores[df_ores_scores.error.isnull()]
df_ores_scores = df_ores_scores.drop('error', axis=1)

# Restrict the annotated judgements to revisions that have ORES scores.
df_judgements = pd.read_csv('../../data/interim/annotated-judgements.csv').drop('Unnamed: 0', axis=1)
df_judgements = df_judgements[df_judgements.revid.isin(df_ores_scores.revid)]
df_judgements.to_csv('../../data/interim/filtered-judgements.csv', index=False)
import requests
from tqdm import tqdm
import json
import mwparserfromhell as mwp
from mwparserfromhell.wikicode import Wikicode
from typing import List, Tuple

def is_node_normal(node):
    # Keep plain text, ordinary wikilinks (no namespace prefix), and basic
    # formatting tags; everything else (templates, refs, etc.) is dropped.
    return any((
        type(node) == mwp.wikicode.Text,
        type(node) == mwp.wikicode.Wikilink and ':' not in str(node.title),
        type(node) == mwp.wikicode.Tag and str(node.tag) in {'b', 'i', 'em', 'strong'}
    ))

def textify(node):
    if type(node) == mwp.wikicode.Text:
        return str(node)
    elif type(node) == mwp.wikicode.Wikilink:
        return str(node.title)
    elif type(node) == mwp.wikicode.Tag:
        return str(node.contents)

def get_body_paragraphs(content: str) -> Tuple[List[Wikicode], List[str]]:
    paras = content.split('\n\n')
    body_paras_code = []
    body_paras_text = []
    for para in paras:
        text = ''.join([textify(node) for node in mwp.parse(para).nodes if is_node_normal(node)]).strip()
        if text != '':
            body_paras_code.append(para)
            body_paras_text.append(text)
    return body_paras_code, body_paras_text

raw_query_results = requests.get('https://quarry.wmcloud.org/run/595237/output/0/json').json()

def revision_wikitext(revid) -> str:
    return requests.get('https://en.wikipedia.org/w/api.php', params={
        'action': 'query',
        'prop': 'revisions',
        'revids': revid,
        'rvslots': '*',
        'rvprop': 'content',
        'formatversion': 2,
        'format': 'json'
    }, headers={
        'User-Agent': 'DraftAcceptPrediction/0.0 (by User:Vahurzpu; mmwootten@outlook.com)'
    }).json()['query']['pages'][0]['revisions'][0]['slots']['main']['content']

def clean_wikitext(wikitext: str) -> str:
    _, body_paras_text = get_body_paragraphs(wikitext)
    return '\n\n'.join(body_paras_text)

def assemble_record(title: str, revid, is_tagged: bool) -> dict:
    raw_wikitext = revision_wikitext(revid)
    cleaned_text = clean_wikitext(raw_wikitext)
    return {
        'page': title,
        'revid': revid,
        'tagged_promo': is_tagged,
        'raw_contents': raw_wikitext,
        'clean_contents': cleaned_text
    }

records = []
for (ts, title, oldid_without_tl, oldid_with_tl, actor) in tqdm(raw_query_results['rows']):
    try:
        old_record = assemble_record(title, oldid_with_tl, True)
        new_record = assemble_record(title, oldid_without_tl, False)
        # Skip redirects and pairs where cleaning left no body text.
        if ('#redirect' in old_record['raw_contents'].lower()) or ('#redirect' in new_record['raw_contents'].lower()):
            continue
        if old_record['clean_contents'].strip() == '' or new_record['clean_contents'].strip() == '':
            continue
        records.append(old_record)
        records.append(new_record)
    except Exception:
        pass

with open('../../data/raw/promo-tagged-pairs.json', 'w') as f:
    json.dump(records, f)
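
# Illustrative sanity check (not part of this commit): reload the saved pairs
# and confirm both classes are present in roughly equal numbers.
with open('../../data/raw/promo-tagged-pairs.json', 'r') as f:
    saved = json.load(f)
print(len(saved), 'records;', sum(r['tagged_promo'] for r in saved), 'tagged promotional')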
import pandas as pd
import requests
import json
from tqdm import tqdm

annotated_judgements = pd.read_csv('../../data/interim/annotated-judgements.csv')
annotated_judgements = annotated_judgements.sample(frac=1).reset_index(drop=True)

# Collect the revids that already have scores so the script can resume
# where it left off.
complete_revids = set()
with open('../../data/interim/ores-scores.json', 'r') as f:
    for line in f:
        complete_revids.add(json.loads(line)['revid'])

# Append new scores; opening with 'w' here would truncate the file and
# discard the already-fetched scores collected above.
with open('../../data/interim/ores-scores.json', 'a') as f:
    for (_, judgment) in tqdm(list(annotated_judgements.iterrows())):
        revid = judgment['revid']
        if revid not in complete_revids:
            request_url = f'https://ores.wikimedia.org/v3/scores/enwiki/{revid}/articlequality'
            ores_response = requests.get(request_url, headers={
                'User-Agent': 'DraftAcceptPrediction/0.0 (by User:Vahurzpu; mmwootten@outlook.com)'
            }).json()
            model_response = next(iter(ores_response['enwiki']['scores'].values()))['articlequality']
            if 'error' in model_response:
                print(json.dumps({'revid': revid, 'error': model_response['error']['message']}), file=f)
            else:
                scores = model_response['score']['probability']
                scores['revid'] = revid
                print(json.dumps(scores), file=f)
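
# For reference (an assumption about the enwiki articlequality model, not
# stated in this commit): each successful line in ores-scores.json should be
# a JSON object of class probabilities keyed by the quality classes
# ('Stub', 'Start', 'C', 'B', 'GA', 'FA'), plus the 'revid' added above.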
import requests
import re
import pywikibot
from pathlib import Path
from xml.etree import ElementTree as ET
from tqdm import tqdm

raw_query_results = requests.get('https://quarry.wmcloud.org/run/588744/output/0/json').json()

EDIT_SUMMARY_PATTERN = re.compile(r'Adding \[\[(.*)\]\] to list of recent AfC creations .*')
USER_AGENT = "DraftAcceptPrediction/0.0 (by User:Vahurzpu; mmwootten@outlook.com)"

enwiki = pywikibot.Site('en', 'wikipedia')

skipped = []
for (timestamp, edit_summary) in tqdm(raw_query_results['rows']):
    match = re.fullmatch(EDIT_SUMMARY_PATTERN, edit_summary)
    if not match:
        continue
    article = pywikibot.Page(enwiki, match.group(1))
    page_xml_file = Path(f'../../data/raw/accepted-drafts/{article.pageid}.xml')
    if page_xml_file.exists():
        continue
    try:
        # Export the page's history up to the acceptance edit.
        exported_xml = requests.post('https://en.wikipedia.org/wiki/Special:Export', params={
            'pages': article.title(),
            'dir': 'desc',
            'offset': timestamp
        }, headers={
            'User-Agent': USER_AGENT
        }).text
        ns = {'mw': 'http://www.mediawiki.org/xml/export-0.10/'}
        page_id = int(ET.fromstring(exported_xml).find('mw:page', ns).find('mw:id', ns).text)
        # Guard against the export resolving to a different page than the query.
        assert page_id == article.pageid
        with open(page_xml_file, 'w') as f:
            f.write(exported_xml)
    except Exception:
        skipped.append(match.group(1))
print(skipped)
from pathlib import Path
from xml.etree import ElementTree as ET
import pandas as pd
from tqdm import tqdm
import json

accepted_folder = Path('../../data/raw/accepted-drafts')
declined_folder = Path('../../data/raw/declined-drafts')

# Goal of the algorithm: get a bunch of revision IDs at which a judgement was made.
# Basically: what was the text of each draft at the moment it was accepted or declined?
revision_judgements = []
revision_texts = dict()

declined_files = list(declined_folder.iterdir())
accepted_files = list(accepted_folder.iterdir())
files_to_examine = declined_files + accepted_files
categories = ['declined'] * len(declined_files) + ['accepted'] * len(accepted_files)
zipped = list(zip(files_to_examine, categories))

for (filepath, folder) in tqdm(zipped):
    raw_xml = ET.parse(filepath)
    ns = {'mw': 'http://www.mediawiki.org/xml/export-0.10/'}
    page = raw_xml.find('mw:page', ns)
    pageid = page.find('mw:id', ns).text
    revisions = page.findall('mw:revision', ns)
    for revision in revisions:
        revid = revision.find('mw:id', ns).text
        edit_summary_elem = revision.find('mw:comment', ns)
        if edit_summary_elem is not None and edit_summary_elem.text is not None:
            edit_summary = edit_summary_elem.text
            # Classify the revision by its AfC edit summary.
            if 'Publishing accepted' in edit_summary:
                judgement = True
            elif 'Declining submission' in edit_summary:
                judgement = False
            elif 'Rejecting submission' in edit_summary:
                judgement = False
            else:
                continue
            revision_judgements.append({
                'folder': folder,
                'pageid': pageid,
                'revid': revid,
                'judgement': judgement
            })
            revision_texts[revid] = revision.find('mw:text', ns).text

pd.DataFrame(revision_judgements).to_csv('../../data/interim/annotated-judgements.csv')
with open('../../data/interim/revision-texts.json', 'w') as f:
    json.dump(revision_texts, f)
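
# Illustrative check (not part of this commit): summarize the extracted
# judgements from the CSV written above.
df = pd.read_csv('../../data/interim/annotated-judgements.csv', index_col=0)
print(df['judgement'].value_counts())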