Team project "Analyzing Gender Share in Casting Actors" as part of the lecture "Data Literacy"

exp-001_Data-Preprocessing-and-Provisioning.ipynb 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353
  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "metadata": {},
  6. "source": [
  7. "# Data Literacy - Project\n",
  8. "## Gender Share in Movies\n",
  9. "#### Tobias Stumpp, Sophia Herrmann"
  10. ]
  11. },
  12. {
  13. "cell_type": "markdown",
  14. "metadata": {},
  15. "source": [
  16. "### README & TODO\n",
  17. "\n",
  18. "Please run all cells of this IPython notebook once, from top to bottom. You may use the button that is revealed by executing the next cell. \n",
  19. "Running the notebook extracts the raw data archives and writes the preprocessed data files that all experiments in this repository build on."
  20. ]
  21. },
  22. {
  23. "cell_type": "code",
  24. "execution_count": 1,
  25. "metadata": {},
  26. "outputs": [
  27. {
  28. "data": {
  29. "text/plain": [
  30. "'Please click this button below to provide the required preprocessed data files for the experiments:'"
  31. ]
  32. },
  33. "metadata": {},
  34. "output_type": "display_data"
  35. },
  36. {
  37. "data": {
  38. "application/vnd.jupyter.widget-view+json": {
  39. "model_id": "6d505788715c424d9776b876af6b6290",
  40. "version_major": 2,
  41. "version_minor": 0
  42. },
  43. "text/plain": [
  44. "Button(description='Run all cells below', style=ButtonStyle())"
  45. ]
  46. },
  47. "metadata": {},
  48. "output_type": "display_data"
  49. }
  50. ],
  51. "source": [
  52. "from IPython.display import Javascript, display\n",
  53. "from ipywidgets import widgets\n",
  54. "\n",
  55. "def run_all(ev):\n",
  56. "    \"\"\"Button callback: execute every cell below the currently selected one.\"\"\"\n",
  57. "    display(Javascript('IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, IPython.notebook.ncells())'))\n",
  58. "\n",
  59. "display(\"Please click this button below to provide the required preprocessed data files for the experiments:\")\n",
  60. "button = widgets.Button(description=\"Run all cells below\")\n",
  61. "button.on_click(run_all)\n",
  62. "display(button)"
  63. ]
  64. },
  65. {
  66. "cell_type": "code",
  67. "execution_count": 2,
  68. "metadata": {},
  69. "outputs": [],
  70. "source": [
  71. "import numpy as np\n",
  72. "import pandas as pd\n",
  73. "import os"
  74. ]
  75. },
  76. {
  77. "cell_type": "code",
  78. "execution_count": 3,
  79. "metadata": {},
  80. "outputs": [],
  81. "source": [
  82. "path = os.path.abspath(globals().get('path', '../'))  # resolved to an absolute path on first run, so re-running this cell does not climb another level\n",
  83. "os.chdir(path)"
  84. ]
  85. },
  86. {
  87. "cell_type": "markdown",
  88. "metadata": {},
  89. "source": [
  90. "### Extract data archive files"
  91. ]
  92. },
  93. {
  94. "cell_type": "code",
  95. "execution_count": 4,
  96. "metadata": {},
  97. "outputs": [],
  98. "source": [
  99. "import gzip\n",
  100. "import shutil\n",
  101. "\n",
  102. "files = [\n",
  103. "    'dat/title.basics.tsv.gz',\n",
  104. "    'dat/title.principals.tsv.gz',\n",
  105. "    'dat/title.ratings.tsv.gz',\n",
  106. "]\n",
  107. "\n",
  108. "def unzip(files=files):\n",
  109. "    \"\"\"Decompress each '.gz' archive in `files` into a file of the same path without the '.gz' suffix.\"\"\"\n",
  110. "    for archive in filter(lambda name: name.endswith('.gz'), files):\n",
  111. "        # Entries that do not end in '.gz' are skipped by the filter above\n",
  112. "        with gzip.open(archive, 'rb') as packed, open(archive[:-3], 'wb') as unpacked:\n",
  113. "            shutil.copyfileobj(packed, unpacked)\n",
  114. "\n",
  115. "unzip(files)"
  116. ]
  117. },
  118. {
  119. "cell_type": "markdown",
  120. "metadata": {},
  121. "source": [
  122. "### Read data files"
  123. ]
  124. },
  125. {
  126. "cell_type": "code",
  127. "execution_count": 5,
  128. "metadata": {},
  129. "outputs": [],
  130. "source": [
  131. "data_film = pd.read_csv('dat/title.basics.tsv', sep='\\t', na_values=['\\\\N'], dtype={\n",
  132. " \"isAdult\": bool,\n",
  133. " \"startYear\": float,\n",
  134. " \"endYear\": float, \n",
  135. " \"runtimeMinutes\": float,\n",
  136. " },\n",
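  137. " # Skip lines that are syntactically incorrect and would therefore cause\n",
  138. " # - a column shift within the row\n",
  139. " # - assignment errors for column datatypes (NOTE: these line numbers presumably match one specific download of the file - re-check after updating the data)\n",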
  140. " skiprows=[\n",
  141. " 1098292,\n",
  142. " 1510501,\n",
  143. " 1900901,\n",
  144. " 2012237,\n",
  145. " 2167663,\n",
  146. " 2313911,\n",
  147. " 3012068,\n",
  148. " 5964307,\n",
  149. " 8605235,\n",
  150. " 8645208,\n",
  151. " ]\n",
  152. ")\n",
  153. "# tconst (string) - alphanumeric unique identifier of the title\n",
  154. "# titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)\n",
  155. "# primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release\n",
  156. "# originalTitle (string) - original title, in the original language\n",
  157. "# isAdult (boolean) - 0: non-adult title; 1: adult title\n",
  158. "# startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year\n",
  159. "# endYear (YYYY) – TV Series end year. ‘\\N’ for all other title types\n",
  160. "# runtimeMinutes – primary runtime of the title, in minutes\n",
  161. "# genres (string array) – includes up to three genres associated with the title\n",
  162. "\n",
  163. "data_rating = pd.read_csv('dat/title.ratings.tsv', sep='\\t', na_values=['\\\\N'], dtype={\n",
  164. "    \"averageRating\": float,\n",
  165. "    \"numVotes\": float,\n",
  166. "})\n",
  167. "# tconst (string) - alphanumeric unique identifier of the title\n",
  168. "# averageRating – weighted average of all the individual user ratings\n",
  169. "# numVotes - number of votes the title has received\n",
  170. "\n",
  171. "data_principals = pd.read_csv('dat/title.principals.tsv', sep='\\t', na_values=['\\\\N'], dtype={\n",
  172. " \"ordering\": float,\n",
  173. "})\n",
  174. "# tconst (string) - alphanumeric unique identifier of the title\n",
  175. "# ordering (integer) – a number to uniquely identify rows for a given titleId\n",
  176. "# nconst (string) - alphanumeric unique identifier of the name/person\n",
  177. "# category (string) - the category of job that person was in\n",
  178. "# job (string) - the specific job title if applicable, else '\\N'\n",
  179. "# characters (string) - the name of the character played if applicable, else '\\N'"
  180. ]
  181. },
  182. {
  183. "cell_type": "markdown",
  184. "metadata": {},
  185. "source": [
  186. "----------\n",
  187. "\n",
  188. "### Clean and merge original data into prepared datasets for experiments"
  189. ]
  190. },
  191. {
  192. "cell_type": "code",
  193. "execution_count": 6,
  194. "metadata": {},
  195. "outputs": [
  196. {
  197. "data": {
  198. "text/plain": [
  199. "'Initially, the dataset contains 600289 movies.'"
  200. ]
  201. },
  202. "metadata": {},
  203. "output_type": "display_data"
  204. },
  205. {
  206. "data": {
  207. "text/plain": [
  208. "'Quantiles 5% and 95% on runtime minutes yield as delimitation minutes [52.0, 135.0].'"
  209. ]
  210. },
  211. "metadata": {},
  212. "output_type": "display_data"
  213. },
  214. {
  215. "data": {
  216. "text/plain": [
  217. "'After dropping rows of these quantiles, the dataset contains 341225 movies, which is 259064 less movies.'"
  218. ]
  219. },
  220. "metadata": {},
  221. "output_type": "display_data"
  222. }
  223. ],
  224. "source": [
  225. "# Keep only higher quality movies, hence,\n",
  226. "# - drop rows whose types aren't movies\n",
  227. "data_film.drop(data_film.index[(data_film[\"titleType\"] != \"movie\")], axis = 0, inplace=True)\n",
  228. "movies_count_before = data_film.shape[0]\n",
  229. "\n",
  230. "# - drop rows with *untypical runtime minutes*\n",
  231. "#   Rows without a runtime fail the range check as well (NaN compares False), so they are\n",
  232. "#   counted separately to keep the report below from attributing them to the quantile cut.\n",
  233. "runtime_missing_count = int(data_film[\"runtimeMinutes\"].isna().sum())\n",
  234. "quantile = data_film[\"runtimeMinutes\"].quantile([0.05,0.95])\n",
  235. "\n",
  236. "data_film = data_film[data_film[\"runtimeMinutes\"].between(quantile[0.05], quantile[0.95])]\n",
  237. "movies_count_after = data_film.shape[0]\n",
  238. "runtime_outlier_count = movies_count_before - movies_count_after - runtime_missing_count\n",
  239. "\n",
  240. "display(f\"Initially, the dataset contains {movies_count_before} movies.\")\n",
  241. "display(f\"Quantiles 5% and 95% on runtime minutes yield as delimitation minutes {list(quantile)}.\")\n",
  242. "display(f\"After dropping {runtime_missing_count} movies without a runtime and {runtime_outlier_count} movies outside these quantiles, the dataset contains {movies_count_after} movies, which is {movies_count_before-movies_count_after} fewer movies.\")"
  243. ]
  244. },
  245. {
  246. "cell_type": "code",
  247. "execution_count": 7,
  248. "metadata": {},
  249. "outputs": [],
  250. "source": [
  251. "# Drop features that for our analysis are either irrelevant or incomplete\n",
  252. "# (errors=\"ignore\" keeps this cell re-runnable once the columns are already gone)\n",
  253. "data_film = data_film.drop(columns=[\"titleType\", \"primaryTitle\", \"originalTitle\", \"isAdult\", \"endYear\"], errors=\"ignore\")\n",
  254. "data_film = data_film.dropna(subset=[\"startYear\", \"runtimeMinutes\"])\n",
  255. "\n",
  256. "data_principals = data_principals.drop(columns=[\"ordering\", \"nconst\", \"job\", \"characters\"], errors=\"ignore\")\n",
  257. "\n",
  258. "# Filter principal cast members for only actors and actresses\n",
  259. "data_principals = data_principals[\n",
  260. "    data_principals[\"category\"].isin([\"actor\", \"actress\"])\n",
  261. "]"
  262. ]
  263. },
  264. {
  265. "cell_type": "code",
  266. "execution_count": 8,
  267. "metadata": {},
  268. "outputs": [],
  269. "source": [
  270. "# Join films with their ratings, then with their acting cast; titles missing from either table are dropped\n",
  271. "data_movie = data_film.merge(data_rating, how=\"inner\", on=\"tconst\")\n",
  272. "data_movie = data_movie.merge(data_principals, how=\"inner\", on=\"tconst\")"
  273. ]
  274. },
  275. {
  276. "cell_type": "code",
  277. "execution_count": 9,
  278. "metadata": {},
  279. "outputs": [],
  280. "source": [
  281. "# Provide atomic genre data on movies\n",
  282. "data_movie_genre = (\n",
  283. "    data_film\n",
  284. "    # Drop features that for genres are irrelevant or incomplete\n",
  285. "    .drop(columns=[\"startYear\", \"runtimeMinutes\"])\n",
  286. "    .dropna(subset=[\"genres\"])\n",
  287. "    # Break down the comma-separated genres to atomic data, one row per genre\n",
  288. "    .assign(genres=lambda frame: frame[\"genres\"].str.split(\",\"))\n",
  289. "    .explode(\"genres\")\n",
  290. "    .reset_index(drop=True)\n",
  291. "    # Correct column title to fit atomic data\n",
  292. "    .rename(columns={\"genres\": \"genre\"})\n",
  293. ")"
  294. ]
  295. },
  296. {
  297. "cell_type": "markdown",
  298. "metadata": {},
  299. "source": [
  300. "### Convert integer numbers to integer datatypes"
  301. ]
  302. },
  303. {
  304. "cell_type": "code",
  305. "execution_count": 10,
  306. "metadata": {},
  307. "outputs": [],
  308. "source": [
  309. "# These columns were read as floats; cast all three to int in a single pass\n",
  310. "data_movie = data_movie.astype(\n",
  311. "    {\"startYear\": int, \"runtimeMinutes\": int, \"numVotes\": int})"
  312. ]
  313. },
  314. {
  315. "cell_type": "markdown",
  316. "metadata": {},
  317. "source": [
  318. "### Write preprocessed data to files"
  319. ]
  320. },
  321. {
  322. "cell_type": "code",
  323. "execution_count": 11,
  324. "metadata": {},
  325. "outputs": [],
  326. "source": [
  327. "data_movie.to_csv(\"dat/data_movie.csv\", index=False)\n",
  328. "data_movie_genre.to_csv(\"dat/data_movie_genre.csv\", index=False)"
  329. ]
  330. }
  331. ],
  332. "metadata": {
  333. "kernelspec": {
  334. "display_name": "Python 3",
  335. "language": "python",
  336. "name": "python3"
  337. },
  338. "language_info": {
  339. "codemirror_mode": {
  340. "name": "ipython",
  341. "version": 3
  342. },
  343. "file_extension": ".py",
  344. "mimetype": "text/x-python",
  345. "name": "python",
  346. "nbconvert_exporter": "python",
  347. "pygments_lexer": "ipython3",
  348. "version": "3.8.8"
  349. }
  350. },
  351. "nbformat": 4,
  352. "nbformat_minor": 4
  353. }

Powered by TurnKey Linux.