Commit d76ceb45 authored by Vahurzpu

Finish up round 1 of the project

parent 8b479574
- ORES
  - P(Stub), P(Start), etc.
- General
  - Number of paragraphs
  - Number of words
  - Number of characters
- Citation-related
  - Generic
    - Number of cited paragraphs
    - Number of uncited paragraphs
    - Number of overall citations
    - Average number of citations per paragraph/word
    - Longest continuous stretch of citations
  - Specifically quality-related
    - Total number of green citations
    - Total number of red citations
    - Number of paragraphs with green citations
    - Number of paragraphs with red citations
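
For reference, a single feature record emitted by the extraction script looks roughly like this (key names match the feature-extraction code in this commit, assuming the usual six ORES quality classes; the values are made up):

```python
example_record = {
    "ores.stub": 0.62, "ores.start": 0.21, "ores.c": 0.09,
    "ores.b": 0.05, "ores.ga": 0.02, "ores.fa": 0.01,
    "general.count.paras": 4, "general.count.words": 350, "general.count.chars": 2100,
    "refs.cited-paragraphs": 3, "refs.uncited-paragraphs": 1, "refs.total-count": 7,
    "refs.citations-per-para": 1.75, "refs.citations-per-word": 0.02, "refs.longest-stretch": 2,
    "refs.reliability.total.green": 4, "refs.reliability.total.red": 1, "refs.reliability.total.unclass": 2,
    "refs.reliability.paras-with.green": 3, "refs.reliability.paras-with.red": 1,
    "promo.score": 0.12,
    "revid": 1234567890, "judgement": True
}
```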
import mwparserfromhell as mwp
from mwparserfromhell.wikicode import Wikicode
import json
import pandas as pd
from tqdm import trange
from typing import List, Tuple
import fasttext
# ---------
# Load data
# ---------
# ORES quality-model scores, one JSON object per line
ores_scores = []
with open('../../data/interim/ores-scores.json', 'r') as f:
    for line in f:
        ores_scores.append(json.loads(line))
df_ores_scores = pd.DataFrame(ores_scores)
df_ores_scores = df_ores_scores[df_ores_scores.error.isnull()]
df_ores_scores = df_ores_scores.drop('error', axis=1)
df_ores_scores = df_ores_scores.set_index('revid')

# Filtered judgements (revid plus a boolean judgement)
df_judgements = pd.read_csv('../../data/interim/filtered-judgements.csv')

# Raw wikitext of each revision, keyed by revision ID (as a string)
with open('../../data/interim/revision-texts.json', 'r') as f:
    raw_revisions = json.load(f)

# CiteHighlighter source-reliability lists: category -> URL fragments
with open('../../data/external/reliability/citehighlighter.json', 'r') as f:
    source_categories = json.load(f)

# fastText model for scoring promotional tone
promo_model = fasttext.load_model('../../models/promotional/fasttext.bin')
# ----------------------------
# Helper and feature functions
# ----------------------------
# Paragraph-related preprocessing
def is_node_normal(node):
    # A "normal" node is ordinary prose: plain text, a regular wikilink
    # (no namespace prefix like File: or Category:), or simple inline formatting.
    return any((
        type(node) == mwp.wikicode.Text,
        type(node) == mwp.wikicode.Wikilink and ':' not in str(node.title),
        type(node) == mwp.wikicode.Tag and str(node.tag) in {'b', 'i', 'em', 'strong'}
    ))

def textify(node):
    # Reduce a prose node to its visible text
    if type(node) == mwp.wikicode.Text:
        return str(node)
    elif type(node) == mwp.wikicode.Wikilink:
        return str(node.title)
    elif type(node) == mwp.wikicode.Tag:
        return str(node.contents)
def get_body_paragraphs(content: str) -> Tuple[List[Wikicode], List[str]]:
    """Split wikitext on blank lines and keep only paragraphs containing actual prose,
    returning both the parsed Wikicode and the plain text of each."""
    paras = content.split('\n\n')
    body_paras_code = []
    body_paras_text = []
    for para in paras:
        parsed = mwp.parse(para)
        text = ''.join([textify(node) for node in parsed.nodes if is_node_normal(node)]).strip()
        if text != '':
            body_paras_code.append(parsed)
            body_paras_text.append(text)
    return body_paras_code, body_paras_text
# General features
def general_features(body_paras_text: List[str]) -> dict:
    return {
        'general.count.paras': len(body_paras_text),
        'general.count.words': sum(len(para.split()) for para in body_paras_text),
        'general.count.chars': sum(len(para) for para in body_paras_text)
    }
# ORES-related
def ores_features(revision_num: int) -> dict:
    return {'ores.' + k.lower(): v for (k, v) in dict(df_ores_scores.loc[revision_num]).items()}
# Referencing-related
def get_citations(parsed: Wikicode) -> List[Wikicode]:
    return [tag for tag in parsed.filter_tags() if tag.tag == 'ref']
def get_citation_category(citation: Wikicode) -> str:
    """
    Get a citation's reliability category according to Novem Linguae's CiteHighlighter list.
    If it falls into more than one (which should be rare), return the first.
    """
    # TODO: Use a more sophisticated citation parser
    # TODO: incorporate source assessments from other places (like JCW and unreliable.js)
    for (category, url_fragments) in source_categories.items():
        for url_fragment in url_fragments:
            if url_fragment in str(citation):
                return category
    return 'unclassifiable'
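# Illustrative example (hypothetical URL fragments, not the real CiteHighlighter lists):
# with source_categories == {'green': ['nytimes.com'], 'red': ['dailymail.co.uk']},
# a citation whose wikitext contains "nytimes.com" is classified 'green', while a
# citation matching no listed fragment falls through to 'unclassifiable'.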
def longest_consecutive_run_citations(parsed: Wikicode) -> int:
    """
    Get the longest consecutive run of citations.
    If this number is too high, it's a likely indication of refbombing.
    """
    longest_run = 0
    current_run = 0
    for piece in parsed.nodes:
        if str(piece).strip() == '':
            continue
        if type(piece) == mwp.wikicode.Tag and piece.tag == 'ref':
            current_run += 1
        else:
            if current_run > longest_run:
                longest_run = current_run
            current_run = 0
    # Also count a run of citations that ends the paragraph
    return max(longest_run, current_run)
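# Example: for a paragraph ending "...a disputed claim.<ref>a</ref> <ref>b</ref><ref>c</ref>",
# the whitespace-only text node between the refs is skipped, so the longest run is 3.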
def generic_citation_features(body_paras_code: List[Wikicode], body_paras_text: List[str]) -> dict:
    total_word_count = sum(len(para.split()) for para in body_paras_text)
    citation_count_per_paragraph = []
    for para in body_paras_code:
        citation_count_per_paragraph.append(len(get_citations(para)))
    return {
        'refs.cited-paragraphs': sum(1 for count in citation_count_per_paragraph if count > 0),
        'refs.uncited-paragraphs': sum(1 for count in citation_count_per_paragraph if count == 0),
        'refs.total-count': sum(citation_count_per_paragraph),
        'refs.citations-per-para': sum(citation_count_per_paragraph) / len(body_paras_code) if len(body_paras_code) > 0 else 0,
        'refs.citations-per-word': sum(citation_count_per_paragraph) / total_word_count if total_word_count > 0 else 0,
        'refs.longest-stretch': max((longest_consecutive_run_citations(para) for para in body_paras_code), default=0),
    }
def reliability_citation_features(body_paras_code: List[Wikicode], body_paras_text: List[str]) -> dict:
    # Collapse the fine-grained CiteHighlighter categories into green / red / unclassifiable
    simplification = {
        "aggregator": "unclassifiable",
        "doi": "green",
        "green": "green",
        "medrs": "green",
        "preprint": "unclassifiable",
        "red": "red",
        "yellow": "unclassifiable",
        "unclassifiable": "unclassifiable"
    }
    citations_by_type = dict()
    for cite_cat in source_categories.keys():
        citations_by_type[cite_cat] = [0] * len(body_paras_code)
    citations_by_type['unclassifiable'] = [0] * len(body_paras_code)
    for (i, para) in enumerate(body_paras_code):
        for citation in get_citations(para):
            citation_cat = simplification[get_citation_category(citation)]
            citations_by_type[citation_cat][i] += 1
    return {
        'refs.reliability.total.green': sum(citations_by_type['green']),
        'refs.reliability.total.red': sum(citations_by_type['red']),
        'refs.reliability.total.unclass': sum(citations_by_type['unclassifiable']),
        'refs.reliability.paras-with.green': sum(1 for count in citations_by_type['green'] if count > 0),
        'refs.reliability.paras-with.red': sum(1 for count in citations_by_type['red'] if count > 0),
    }
# Promotional-related
def promotional_features(body_paras_text: List[str]) -> dict:
    all_together = ' '.join(body_paras_text).replace('\n', ' ')
    (label,), (score,) = promo_model.predict(all_together)
    return {
        'promo.score': (score if label == '__label__promotional' else 1 - score)
    }
# -------------------------------
# Calculate and save the features
# -------------------------------
df_judgements = df_judgements.sample(frac=1)  # shuffle the rows
with open('../../data/processed/features.jsonl', 'w') as f:
    for i in trange(len(df_judgements.revid)):
        row = df_judgements.iloc[i]
        revision_text = raw_revisions[str(row.revid)]
        body_paras_code, body_paras_text = get_body_paragraphs(revision_text)
        features = dict()
        features.update(general_features(body_paras_text))
        features.update(ores_features(int(row.revid)))
        features.update(generic_citation_features(body_paras_code, body_paras_text))
        features.update(reliability_citation_features(body_paras_code, body_paras_text))
        features.update(promotional_features(body_paras_text))
        features['revid'] = int(row.revid)
        features['judgement'] = bool(row.judgement)
        print(json.dumps(features), file=f)
import json
import pandas as pd
all_records = []
with open('../../data/processed/features.jsonl', 'r') as f:
    for line in f:
        all_records.append(json.loads(line))
pd.DataFrame(all_records).to_csv('../../data/processed/features.csv')
import fasttext
model = fasttext.train_supervised('../../../data/interim/promo-tags/train.txt', epoch=50, lr=0.3)
model.save_model("../../../models/promotional/fasttext.bin")
library(C50)
library(kernlab)
setwd("~/Projects/wikipedia/promising-drafts/data/processed/")
data <- read.csv("features.csv")
data <- subset(data, select=-c(revid, X))
baseline_data <- subset(data, select=c(
  ores.stub,
  ores.start,
  ores.c,
  ores.b,
  ores.ga,
  ores.fa,
  judgement
))
train_sample <- sample(nrow(data), as.integer(0.8 * nrow(data)))
baseline_train <- baseline_data[train_sample, ]
baseline_test <- baseline_data[-train_sample, ]
data_train <- data[train_sample, ]
data_test <- data[-train_sample, ]
svm_baseline <- ksvm(as.factor(judgement) ~ ., data = baseline_train, kernel = "vanilladot")
svm_expanded <- ksvm(as.factor(judgement) ~ ., data = data_train, kernel = "vanilladot")
xs_baseline <- subset(baseline_train, select=-c(judgement))
ys_baseline <- as.factor(baseline_train$judgement)
xs_expanded <- subset(data_train, select=-c(judgement))
ys_expanded <- as.factor(data_train$judgement)
trees_baseline <- C5.0(xs_baseline, ys_baseline, trials = 100)
trees_expanded <- C5.0(xs_expanded, ys_expanded, trials = 100)
accuracy_svm_baseline <- sum(predict(svm_baseline, baseline_test) == baseline_test$judgement) / nrow(baseline_test)
accuracy_svm_expanded <- sum(predict(svm_expanded, data_test) == data_test$judgement) / nrow(data_test)
accuracy_trees_baseline <- sum(predict(trees_baseline, baseline_test) == baseline_test$judgement) / nrow(baseline_test)
accuracy_trees_expanded <- sum(predict(trees_expanded, data_test) == data_test$judgement) / nrow(data_test)
c(accuracy_svm_baseline, accuracy_svm_expanded)
c(accuracy_trees_baseline, accuracy_trees_expanded)