Commit 4588a405 authored by Muniza
Browse files

Add image recommendations stats notebook

parent 389af004
%% Cell type:code id:d14418ff tags:
``` python
import wmfdata
# Create the Spark session through wmfdata, requesting the "yarn-regular"
# session type and overriding the number of shuffle partitions.
session_overrides = {"spark.sql.shuffle.partitions": 2048}
spark = wmfdata.spark.get_session(
    type="yarn-regular",
    extra_settings=session_overrides,
)
```
%% Output
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
%% Cell type:code id:6e585b05 tags:
``` python
import os
import subprocess
from pyspark.sql import functions as F
from pyspark.sql import types as T
```
%% Cell type:code id:425af15f tags:
``` python
# Collect the HDFS paths of all recommendation parquet files in the
# output directory of the 2022-06-07 run.
recs_dir = "/user/mnz/imagerecs/recs-2022-06-07"

# Pass the command as an argument list so that no shell is involved in
# running it (no quoting or injection concerns if the path ever changes).
cmd = ["hadoop", "fs", "-ls", recs_dir]
output = subprocess.check_output(cmd).decode("utf-8").splitlines()

# The path is the last whitespace-separated field of a listing line; any
# line that does not end in ".parquet" is ignored. Trailing whitespace is
# stripped first so a stray "\r" or space cannot hide a parquet file.
rec_files = [
    line.split()[-1]
    for line in output
    if line.rstrip().endswith(".parquet")
]
```
%% Cell type:code id:29267692 tags:
``` python
# Number of recommendations that pass the frequency filter, keyed by the
# wiki_db name parsed from each parquet file name.
recs_count = {}
for rec_file in rec_files:
    recs_df = spark.read.parquet(rec_file)

    # `recommended_images` holds an array of maps (wiki_db -> [images]);
    # the first explode produces one row per map, in column `images`.
    per_map_df = recs_df.select(
        "*",
        F.explode("recommended_images").alias("images"),
    )

    # Exploding the map gives `key`/`value` columns; exploding `value`
    # then gives one row per individual image.
    per_image_df = (
        per_map_df
        .select(
            "item_id",
            "target_title",
            "target_heading",
            F.explode(F.col("images")),
        )
        .withColumn("image", F.explode(F.col("value")).alias("image"))
    )

    # Count how often each image occurs for a given section and keep only
    # images seen at least 5 times (the intent being "in at least 5
    # languages").
    # NOTE(review): F.count("*") counts rows, not distinct languages --
    # confirm an image cannot appear twice for the same language.
    section_image_cols = ["item_id", "target_title", "target_heading", "image"]
    image_counts_df = (
        per_image_df
        .groupBy(section_image_cols)
        .agg(F.count("*").alias("frequency"))
    )
    frequent_df = image_counts_df.filter(F.col("frequency") >= 5)

    # Derive the wiki_db from the file name: take the part before the
    # first "." and drop the "_recommendations" marker.
    file_name = os.path.basename(rec_file)
    wiki_db = file_name.split(".")[0].replace("_recommendations", "")
    recs_count[wiki_db] = frequent_df.count()
```
%% Cell type:code id:e6630f38 tags:
``` python
# pandas is not imported in the cells above, so import it here; without
# this the cell raises NameError on a fresh kernel ("Restart & Run All").
import pandas as pd

# Build a table with one row per wiki: the dict keys become the index,
# which is sorted and then moved into a regular "wiki_db" column.
rec_stats = pd.DataFrame.from_dict(recs_count, orient="index", columns=["recommendations"])
rec_stats = rec_stats.sort_index().reset_index().rename(columns={"index": "wiki_db"})

# Drop wikis without any recommendation. The row labels assigned before
# the filter are kept, so the printed index has gaps where wikis were
# removed.
rec_stats = rec_stats[rec_stats["recommendations"] > 0]
```
%% Cell type:code id:a3e281e6 tags:
``` python
# Render the whole table as text (to_string does not truncate rows) and
# print it.
rec_stats_text = rec_stats.to_string()
print(rec_stats_text)
```
%% Output
wiki_db recommendations
0 abwiki 1360
1 acewiki 19
2 adywiki 1332
3 afwiki 6166
5 amwiki 3
6 angwiki 24
7 anwiki 2700
9 arwiki 10214
10 arywiki 184
11 arzwiki 3691
12 avwiki 627
13 aywiki 6
14 azbwiki 955
15 azwiki 8605
16 banwiki 200
17 barwiki 926
18 bawiki 4322
19 bclwiki 237
21 bewiki 9051
22 bgwiki 16926
24 biwiki 4
26 bnwiki 7933
28 bpywiki 754
29 brwiki 354
30 bswiki 8722
31 cawiki 17232
32 cbk_zamwiki 446
34 cebwiki 287
36 chwiki 20
37 ckbwiki 12
38 cowiki 1094
41 cswiki 20171
42 cvwiki 1743
43 cywiki 3393
44 dawiki 11887
45 dewiki 17406
47 diqwiki 719
48 dsbwiki 748
49 dtywiki 134
51 enwiki 16865
52 eswiki 18019
53 etwiki 12326
54 euwiki 12949
55 extwiki 418
56 fawiki 13017
57 ffwiki 30
58 fiwiki 14673
59 frrwiki 981
60 frwiki 22328
61 furwiki 748
62 fywiki 5713
63 gagwiki 59
64 ganwiki 613
65 gdwiki 1224
66 glkwiki 96
67 gnwiki 5
68 gomwiki 140
69 gorwiki 57
72 guwiki 1122
74 hawwiki 35
75 hewiki 13434
76 hifwiki 244
77 hrwiki 11298
78 hsbwiki 487
79 htwiki 1040
80 huwiki 15820
81 hywiki 10558
82 hywwiki 2267
83 iawiki 1453
84 iewiki 896
85 igwiki 31
86 inhwiki 54
87 iowiki 3691
88 iswiki 3374
89 itwiki 23174
90 jawiki 14412
92 jvwiki 1986
93 kaawiki 131
94 kabwiki 11
95 kawiki 7636
96 kbdwiki 75
97 kgwiki 23
99 kkwiki 5696
100 kmwiki 1007
101 kowiki 11440
102 krcwiki 1106
103 kshwiki 13
104 kswiki 2
105 kuwiki 1361
106 kvwiki 241
107 kwwiki 64
108 kywiki 1971
109 lawiki 4851
110 lbewiki 40
111 lbwiki 3406
112 lezwiki 928
113 lfnwiki 946
114 lldwiki 1370
115 lmowiki 1560
116 lnwiki 1577
117 lowiki 301
118 ltwiki 14459
119 lvwiki 10600
121 maiwiki 2163
122 mdfwiki 1754
123 mgwiki 1022
124 minwiki 333
125 mkwiki 8809
126 mlwiki 3667
128 mnwiki 2335
129 mnwwiki 34
130 mrjwiki 1030
131 mrwiki 6291
132 mtwiki 1004
133 mwlwiki 621
134 mywiki 955
135 mznwiki 1848
136 nahwiki 125
138 napwiki 565
140 nds_nlwiki 802
141 newiki 2980
142 newwiki 1821
143 nlwiki 21374
144 nowiki 14115
146 orwiki 660
147 oswiki 678
148 pamwiki 251
149 papwiki 275
150 pawiki 2095
151 pcdwiki 166
152 pdcwiki 33
153 pihwiki 6
154 plwiki 20330
155 pmswiki 1001
156 pnbwiki 2351
157 pswiki 2586
158 ptwiki 18488
159 quwiki 82
160 rmwiki 1128
161 rmywiki 2
162 ruewiki 997
164 rwwiki 13
165 sahwiki 632
167 sawiki 407
168 scnwiki 2563
169 scowiki 1302
170 scwiki 886
171 sdwiki 983
172 sewiki 188
175 shwiki 10593
176 simplewiki 10363
177 siwiki 494
178 skrwiki 172
179 skwiki 12741
180 slwiki 8872
181 smnwiki 275
182 snwiki 41
183 sqwiki 7061
184 srnwiki 17
185 srwiki 14390
186 sswiki 9
187 stwiki 7
188 suwiki 1912
189 swwiki 3335
190 szlwiki 323
191 tawiki 5337
192 tcywiki 84
193 tetwiki 28
194 tgwiki 1267
195 thwiki 8054
196 tlwiki 1897
197 tnwiki 2
198 tpiwiki 40
199 trwiki 16377
201 ttwiki 3588
202 twwiki 4
203 tyvwiki 75
204 ukwiki 21303
205 urwiki 3707
206 uzwiki 5880
207 vecwiki 2774
208 vepwiki 3314
209 vewiki 1
210 viwiki 9965
211 vowiki 16
213 warwiki 978
214 wawiki 80
215 wowiki 12
216 wuuwiki 963
217 xalwiki 102
218 yiwiki 1067
219 yowiki 54
220 zeawiki 380
222 zh_yuewiki 1970
223 zhwiki 12080
%% Cell type:code id:303179b4 tags:
``` python
# Write the per-wiki counts to a tab-separated file, leaving out the
# DataFrame index.
rec_stats_path = "/home/mnz/one_offs/rec_stats_20220607.tsv"
rec_stats.to_csv(rec_stats_path, sep="\t", index=False)
```
%% Cell type:code id:07f72578 tags:
``` python
```
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment