Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Muniza
One Offs
Commits
4588a405
Commit
4588a405
authored
Aug 15, 2022
by
Muniza
Browse files
Add image recommendations stats notebook
parent
389af004
Changes
1
Hide whitespace changes
Inline
Side-by-side
notebooks/image_rec_stats.ipynb
0 → 100644
View file @
4588a405
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "d14418ff",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.\n"
]
}
],
"source": [
"import wmfdata\n",
"\n",
"spark = wmfdata.spark.get_session(\n",
" type=\"yarn-regular\", \n",
" extra_settings={\"spark.sql.shuffle.partitions\": 2048}\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "6e585b05",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import subprocess\n",
"from pyspark.sql import functions as F\n",
"from pyspark.sql import types as T"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "425af15f",
"metadata": {},
"outputs": [],
"source": [
"# get all recommendation file paths from hdfs\n",
"cmd = \"hadoop fs -ls /user/mnz/imagerecs/recs-2022-06-07\"\n",
"output = subprocess.check_output(cmd, shell=True).decode(\"utf-8\").strip().split(\"\\n\")\n",
"rec_files = [path.split()[-1] for path in output if path.endswith(\".parquet\")]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "29267692",
"metadata": {},
"outputs": [],
"source": [
"recs_count = {}\n",
"\n",
"for file in rec_files:\n",
" image_recommendations_df = spark.read.parquet(file)\n",
" \n",
" # the recommended_images col contains an array of\n",
" # maps (wiki_db -> [images]) so we need to explode\n",
" # it in order to get to individual maps\n",
" exploded_recs_df = image_recommendations_df.select(\n",
" \"*\",\n",
" F.explode(\"recommended_images\").alias(\"images\")\n",
" )\n",
" \n",
" # explode the maps, aggregate same images across\n",
" # all languages for each section and filter out any\n",
" # images that occur in fewer than 5 languages\n",
" filtered_recs_df = (\n",
" exploded_recs_df\n",
" .select(\n",
" \"item_id\", \n",
" \"target_title\", \n",
" \"target_heading\",\n",
" F.explode(F.col(\"images\"))\n",
" )\n",
" .withColumn(\n",
" \"image\",\n",
" F.explode(F.col(\"value\")).alias(\"image\")\n",
" )\n",
" .groupBy([\"item_id\", \"target_title\", \"target_heading\", \"image\"])\n",
" .agg(F.count(\"*\").alias(\"frequency\"))\n",
" .filter(F.col(\"frequency\") >= 5)\n",
" )\n",
" wiki_db = os.path.basename(file).split(\".\")[0].replace(\"_recommendations\", \"\")\n",
" recs_count[wiki_db] = filtered_recs_df.count()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "e6630f38",
"metadata": {},
"outputs": [],
"source": [
"rec_stats = pd.DataFrame.from_dict(recs_count, orient=\"index\", columns=[\"recommendations\"])\n",
"rec_stats = rec_stats.sort_index().reset_index().rename(columns={\"index\": \"wiki_db\"})\n",
"rec_stats = rec_stats[rec_stats[\"recommendations\"] > 0]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "a3e281e6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" wiki_db recommendations\n",
"0 abwiki 1360\n",
"1 acewiki 19\n",
"2 adywiki 1332\n",
"3 afwiki 6166\n",
"5 amwiki 3\n",
"6 angwiki 24\n",
"7 anwiki 2700\n",
"9 arwiki 10214\n",
"10 arywiki 184\n",
"11 arzwiki 3691\n",
"12 avwiki 627\n",
"13 aywiki 6\n",
"14 azbwiki 955\n",
"15 azwiki 8605\n",
"16 banwiki 200\n",
"17 barwiki 926\n",
"18 bawiki 4322\n",
"19 bclwiki 237\n",
"21 bewiki 9051\n",
"22 bgwiki 16926\n",
"24 biwiki 4\n",
"26 bnwiki 7933\n",
"28 bpywiki 754\n",
"29 brwiki 354\n",
"30 bswiki 8722\n",
"31 cawiki 17232\n",
"32 cbk_zamwiki 446\n",
"34 cebwiki 287\n",
"36 chwiki 20\n",
"37 ckbwiki 12\n",
"38 cowiki 1094\n",
"41 cswiki 20171\n",
"42 cvwiki 1743\n",
"43 cywiki 3393\n",
"44 dawiki 11887\n",
"45 dewiki 17406\n",
"47 diqwiki 719\n",
"48 dsbwiki 748\n",
"49 dtywiki 134\n",
"51 enwiki 16865\n",
"52 eswiki 18019\n",
"53 etwiki 12326\n",
"54 euwiki 12949\n",
"55 extwiki 418\n",
"56 fawiki 13017\n",
"57 ffwiki 30\n",
"58 fiwiki 14673\n",
"59 frrwiki 981\n",
"60 frwiki 22328\n",
"61 furwiki 748\n",
"62 fywiki 5713\n",
"63 gagwiki 59\n",
"64 ganwiki 613\n",
"65 gdwiki 1224\n",
"66 glkwiki 96\n",
"67 gnwiki 5\n",
"68 gomwiki 140\n",
"69 gorwiki 57\n",
"72 guwiki 1122\n",
"74 hawwiki 35\n",
"75 hewiki 13434\n",
"76 hifwiki 244\n",
"77 hrwiki 11298\n",
"78 hsbwiki 487\n",
"79 htwiki 1040\n",
"80 huwiki 15820\n",
"81 hywiki 10558\n",
"82 hywwiki 2267\n",
"83 iawiki 1453\n",
"84 iewiki 896\n",
"85 igwiki 31\n",
"86 inhwiki 54\n",
"87 iowiki 3691\n",
"88 iswiki 3374\n",
"89 itwiki 23174\n",
"90 jawiki 14412\n",
"92 jvwiki 1986\n",
"93 kaawiki 131\n",
"94 kabwiki 11\n",
"95 kawiki 7636\n",
"96 kbdwiki 75\n",
"97 kgwiki 23\n",
"99 kkwiki 5696\n",
"100 kmwiki 1007\n",
"101 kowiki 11440\n",
"102 krcwiki 1106\n",
"103 kshwiki 13\n",
"104 kswiki 2\n",
"105 kuwiki 1361\n",
"106 kvwiki 241\n",
"107 kwwiki 64\n",
"108 kywiki 1971\n",
"109 lawiki 4851\n",
"110 lbewiki 40\n",
"111 lbwiki 3406\n",
"112 lezwiki 928\n",
"113 lfnwiki 946\n",
"114 lldwiki 1370\n",
"115 lmowiki 1560\n",
"116 lnwiki 1577\n",
"117 lowiki 301\n",
"118 ltwiki 14459\n",
"119 lvwiki 10600\n",
"121 maiwiki 2163\n",
"122 mdfwiki 1754\n",
"123 mgwiki 1022\n",
"124 minwiki 333\n",
"125 mkwiki 8809\n",
"126 mlwiki 3667\n",
"128 mnwiki 2335\n",
"129 mnwwiki 34\n",
"130 mrjwiki 1030\n",
"131 mrwiki 6291\n",
"132 mtwiki 1004\n",
"133 mwlwiki 621\n",
"134 mywiki 955\n",
"135 mznwiki 1848\n",
"136 nahwiki 125\n",
"138 napwiki 565\n",
"140 nds_nlwiki 802\n",
"141 newiki 2980\n",
"142 newwiki 1821\n",
"143 nlwiki 21374\n",
"144 nowiki 14115\n",
"146 orwiki 660\n",
"147 oswiki 678\n",
"148 pamwiki 251\n",
"149 papwiki 275\n",
"150 pawiki 2095\n",
"151 pcdwiki 166\n",
"152 pdcwiki 33\n",
"153 pihwiki 6\n",
"154 plwiki 20330\n",
"155 pmswiki 1001\n",
"156 pnbwiki 2351\n",
"157 pswiki 2586\n",
"158 ptwiki 18488\n",
"159 quwiki 82\n",
"160 rmwiki 1128\n",
"161 rmywiki 2\n",
"162 ruewiki 997\n",
"164 rwwiki 13\n",
"165 sahwiki 632\n",
"167 sawiki 407\n",
"168 scnwiki 2563\n",
"169 scowiki 1302\n",
"170 scwiki 886\n",
"171 sdwiki 983\n",
"172 sewiki 188\n",
"175 shwiki 10593\n",
"176 simplewiki 10363\n",
"177 siwiki 494\n",
"178 skrwiki 172\n",
"179 skwiki 12741\n",
"180 slwiki 8872\n",
"181 smnwiki 275\n",
"182 snwiki 41\n",
"183 sqwiki 7061\n",
"184 srnwiki 17\n",
"185 srwiki 14390\n",
"186 sswiki 9\n",
"187 stwiki 7\n",
"188 suwiki 1912\n",
"189 swwiki 3335\n",
"190 szlwiki 323\n",
"191 tawiki 5337\n",
"192 tcywiki 84\n",
"193 tetwiki 28\n",
"194 tgwiki 1267\n",
"195 thwiki 8054\n",
"196 tlwiki 1897\n",
"197 tnwiki 2\n",
"198 tpiwiki 40\n",
"199 trwiki 16377\n",
"201 ttwiki 3588\n",
"202 twwiki 4\n",
"203 tyvwiki 75\n",
"204 ukwiki 21303\n",
"205 urwiki 3707\n",
"206 uzwiki 5880\n",
"207 vecwiki 2774\n",
"208 vepwiki 3314\n",
"209 vewiki 1\n",
"210 viwiki 9965\n",
"211 vowiki 16\n",
"213 warwiki 978\n",
"214 wawiki 80\n",
"215 wowiki 12\n",
"216 wuuwiki 963\n",
"217 xalwiki 102\n",
"218 yiwiki 1067\n",
"219 yowiki 54\n",
"220 zeawiki 380\n",
"222 zh_yuewiki 1970\n",
"223 zhwiki 12080\n"
]
}
],
"source": [
"print(rec_stats.to_string())"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "303179b4",
"metadata": {},
"outputs": [],
"source": [
"rec_stats.to_csv(\"/home/mnz/one_offs/rec_stats_20220607.tsv\", sep=\"\\t\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "07f72578",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
%% Cell type:code id:d14418ff tags:
```
python
import wmfdata

# Spark settings passed through to the session; the shuffle partition count
# is raised to 2048 for the aggregations run later in this notebook.
spark_settings = {"spark.sql.shuffle.partitions": 2048}

# Start (or fetch) a "yarn-regular" Spark session via wmfdata.
spark = wmfdata.spark.get_session(
    type="yarn-regular",
    extra_settings=spark_settings,
)
```
%% Output
PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.
%% Cell type:code id:6e585b05 tags:
```
python
import os
import subprocess

import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import types as T
```
%% Cell type:code id:425af15f tags:
```
python
# List the recommendations directory on HDFS and collect the paths of all
# parquet files in it.
cmd = "hadoop fs -ls /user/mnz/imagerecs/recs-2022-06-07"
raw_listing = subprocess.check_output(cmd, shell=True)
output = raw_listing.decode("utf-8").strip().split("\n")

# Keep only listing lines that end in ".parquet"; the path is taken to be the
# last whitespace-separated field of each such line.
parquet_entries = (line for line in output if line.endswith(".parquet"))
rec_files = [line.split()[-1] for line in parquet_entries]
```
%% Cell type:code id:29267692 tags:
```
python
# Maps a wiki name (derived from the parquet file name) to the number of
# recommendation rows that survive the frequency filter below.
recs_count = {}

for rec_file in rec_files:
    image_recommendations_df = spark.read.parquet(rec_file)

    # recommended_images holds an array of maps (wiki_db -> [images]);
    # exploding the array yields one map per row in the "images" column.
    exploded_recs_df = image_recommendations_df.select(
        "*", F.explode("recommended_images").alias("images")
    )

    # Exploding a map produces "key"/"value" columns; "value" is the list of
    # images, which is exploded again so every row carries a single image.
    per_image_df = exploded_recs_df.select(
        "item_id", "target_title", "target_heading", F.explode(F.col("images"))
    ).withColumn("image", F.explode(F.col("value")).alias("image"))

    # Count how often each image shows up for a given section.
    image_frequency_df = per_image_df.groupBy(
        ["item_id", "target_title", "target_heading", "image"]
    ).agg(F.count("*").alias("frequency"))

    # Keep only images that occur at least 5 times for the section.
    filtered_recs_df = image_frequency_df.filter(F.col("frequency") >= 5)

    # File name up to the first "." with the "_recommendations" part removed.
    file_stem = os.path.basename(rec_file).split(".")[0]
    wiki_db = file_stem.replace("_recommendations", "")
    recs_count[wiki_db] = filtered_recs_df.count()
```
%% Cell type:code id:e6630f38 tags:
```
python
# Turn the {wiki_db: count} mapping into a two-column table ordered by wiki
# name. This cell needs pandas to be imported as `pd` in the imports cell.
rec_stats_all = (
    pd.DataFrame.from_dict(recs_count, orient="index", columns=["recommendations"])
    .sort_index()
    .reset_index()
    .rename(columns={"index": "wiki_db"})
)

# Drop wikis without any recommendations. The row labels are intentionally
# left untouched, so gaps in the printed index mark the wikis removed here.
rec_stats = rec_stats_all[rec_stats_all["recommendations"] > 0]
```
%% Cell type:code id:a3e281e6 tags:
```
python
# Render the complete table as plain text (no row truncation) and print it.
full_table = rec_stats.to_string()
print(full_table)
```
%% Output
wiki_db recommendations
0 abwiki 1360
1 acewiki 19
2 adywiki 1332
3 afwiki 6166
5 amwiki 3
6 angwiki 24
7 anwiki 2700
9 arwiki 10214
10 arywiki 184
11 arzwiki 3691
12 avwiki 627
13 aywiki 6
14 azbwiki 955
15 azwiki 8605
16 banwiki 200
17 barwiki 926
18 bawiki 4322
19 bclwiki 237
21 bewiki 9051
22 bgwiki 16926
24 biwiki 4
26 bnwiki 7933
28 bpywiki 754
29 brwiki 354
30 bswiki 8722
31 cawiki 17232
32 cbk_zamwiki 446
34 cebwiki 287
36 chwiki 20
37 ckbwiki 12
38 cowiki 1094
41 cswiki 20171
42 cvwiki 1743
43 cywiki 3393
44 dawiki 11887
45 dewiki 17406
47 diqwiki 719
48 dsbwiki 748
49 dtywiki 134
51 enwiki 16865
52 eswiki 18019
53 etwiki 12326
54 euwiki 12949
55 extwiki 418
56 fawiki 13017
57 ffwiki 30
58 fiwiki 14673
59 frrwiki 981
60 frwiki 22328
61 furwiki 748
62 fywiki 5713
63 gagwiki 59
64 ganwiki 613
65 gdwiki 1224
66 glkwiki 96
67 gnwiki 5
68 gomwiki 140
69 gorwiki 57
72 guwiki 1122
74 hawwiki 35
75 hewiki 13434
76 hifwiki 244
77 hrwiki 11298
78 hsbwiki 487
79 htwiki 1040
80 huwiki 15820
81 hywiki 10558
82 hywwiki 2267
83 iawiki 1453
84 iewiki 896
85 igwiki 31
86 inhwiki 54
87 iowiki 3691
88 iswiki 3374
89 itwiki 23174
90 jawiki 14412
92 jvwiki 1986
93 kaawiki 131
94 kabwiki 11
95 kawiki 7636
96 kbdwiki 75
97 kgwiki 23
99 kkwiki 5696
100 kmwiki 1007
101 kowiki 11440
102 krcwiki 1106
103 kshwiki 13
104 kswiki 2
105 kuwiki 1361
106 kvwiki 241
107 kwwiki 64
108 kywiki 1971
109 lawiki 4851
110 lbewiki 40
111 lbwiki 3406
112 lezwiki 928
113 lfnwiki 946
114 lldwiki 1370
115 lmowiki 1560
116 lnwiki 1577
117 lowiki 301
118 ltwiki 14459
119 lvwiki 10600
121 maiwiki 2163
122 mdfwiki 1754
123 mgwiki 1022
124 minwiki 333
125 mkwiki 8809
126 mlwiki 3667
128 mnwiki 2335
129 mnwwiki 34
130 mrjwiki 1030
131 mrwiki 6291
132 mtwiki 1004
133 mwlwiki 621
134 mywiki 955
135 mznwiki 1848
136 nahwiki 125
138 napwiki 565
140 nds_nlwiki 802
141 newiki 2980
142 newwiki 1821
143 nlwiki 21374
144 nowiki 14115
146 orwiki 660
147 oswiki 678
148 pamwiki 251
149 papwiki 275
150 pawiki 2095
151 pcdwiki 166
152 pdcwiki 33
153 pihwiki 6
154 plwiki 20330
155 pmswiki 1001
156 pnbwiki 2351
157 pswiki 2586
158 ptwiki 18488
159 quwiki 82
160 rmwiki 1128
161 rmywiki 2
162 ruewiki 997
164 rwwiki 13
165 sahwiki 632
167 sawiki 407
168 scnwiki 2563
169 scowiki 1302
170 scwiki 886
171 sdwiki 983
172 sewiki 188
175 shwiki 10593
176 simplewiki 10363
177 siwiki 494
178 skrwiki 172
179 skwiki 12741
180 slwiki 8872
181 smnwiki 275
182 snwiki 41
183 sqwiki 7061
184 srnwiki 17
185 srwiki 14390
186 sswiki 9
187 stwiki 7
188 suwiki 1912
189 swwiki 3335
190 szlwiki 323
191 tawiki 5337
192 tcywiki 84
193 tetwiki 28
194 tgwiki 1267
195 thwiki 8054
196 tlwiki 1897
197 tnwiki 2
198 tpiwiki 40
199 trwiki 16377
201 ttwiki 3588
202 twwiki 4
203 tyvwiki 75
204 ukwiki 21303
205 urwiki 3707
206 uzwiki 5880
207 vecwiki 2774
208 vepwiki 3314
209 vewiki 1
210 viwiki 9965
211 vowiki 16
213 warwiki 978
214 wawiki 80
215 wowiki 12
216 wuuwiki 963
217 xalwiki 102
218 yiwiki 1067
219 yowiki 54
220 zeawiki 380
222 zh_yuewiki 1970
223 zhwiki 12080
%% Cell type:code id:303179b4 tags:
```
python
# Persist the per-wiki counts as a tab-separated file without the row index.
rec_stats.to_csv(
    "/home/mnz/one_offs/rec_stats_20220607.tsv",
    sep="\t",
    index=False,
)
```
%% Cell type:code id:07f72578 tags:
```
python
```
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment