| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353 |
- {
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Data Literacy - Project\n",
- "## Gender Share in Movies\n",
- "#### Tobias Stumpp, Sophia Herrmann"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### README & TODO\n",
- "\n",
- "Please run all cells of this IPython notebook once. You can use the button that is revealed by executing the next cell. \n",
- "Running the notebook is a preprocessing step: it prepares and writes the data files that all experiments in this repository rely on."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Please click this button below to provide the required preprocessed data files for the experiments:'"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "6d505788715c424d9776b876af6b6290",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Button(description='Run all cells below', style=ButtonStyle())"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "from IPython.display import Javascript, display\n",
- "from ipywidgets import widgets\n",
- "\n",
- "def run_all(ev):\n",
- "    \"\"\"Button callback: ask the notebook front end to run all cells below the selected one.\n",
- "\n",
- "    `ev` is the argument handed over by `Button.on_click`; it is not used.\n",
- "    \"\"\"\n",
- "    # A `Javascript` object only has an effect once it is displayed. The previous\n",
- "    # undisplayed `Javascript('...do_shutdown(True)')` statement was therefore a\n",
- "    # no-op and has been removed.\n",
- "    # NOTE(review): the `IPython.notebook` JavaScript object is presumably only\n",
- "    # defined by the classic Notebook front end - confirm for other front ends.\n",
- "    display(Javascript('IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, IPython.notebook.ncells())'))\n",
- "\n",
- "# Show the instruction text first, then the button that triggers `run_all`\n",
- "display(\"Please click this button below to provide the required preprocessed data files for the experiments:\")\n",
- "button = widgets.Button(description=\"Run all cells below\")\n",
- "button.on_click(run_all)\n",
- "display(button)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import numpy as np\n",
- "import pandas as pd\n",
- "import os"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "path = '../'\n",
- "# Resolve the target directory only once per kernel session. A plain\n",
- "# `os.chdir('../')` would climb one directory further up on every re-run of\n",
- "# this cell and break all relative 'dat/...' paths used below.\n",
- "if '_project_root' not in globals():\n",
- "    _project_root = os.path.abspath(path)\n",
- "os.chdir(_project_root)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Extract data archive files"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "import gzip\n",
- "import shutil\n",
- "\n",
- "# Archives to extract, relative to the current working directory\n",
- "files = [\n",
- "    'dat/title.basics.tsv.gz',\n",
- "    'dat/title.principals.tsv.gz',\n",
- "    'dat/title.ratings.tsv.gz',\n",
- "]\n",
- "\n",
- "def unzip(files=files):\n",
- "    \"\"\"Decompress every `.gz` archive in `files` next to the archive itself.\n",
- "\n",
- "    The output path is the archive path without its `.gz` suffix. Entries\n",
- "    that do not end in `.gz` are skipped.\n",
- "    \"\"\"\n",
- "    for archive_path in files:\n",
- "        if not archive_path.endswith('.gz'):\n",
- "            continue\n",
- "        target_path = archive_path[:-3]  # strip the trailing '.gz'\n",
- "        with gzip.open(archive_path, 'rb') as packed, open(target_path, 'wb') as unpacked:\n",
- "            shutil.copyfileobj(packed, unpacked)\n",
- "\n",
- "unzip(files)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Read data files"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Lines of title.basics.tsv that are syntactically incorrect. Reading them would cause\n",
- "# - a column shift within the row\n",
- "# - assignment errors for column datatypes\n",
- "# so they are skipped while parsing.\n",
- "malformed_basics_rows = [\n",
- "    1098292,\n",
- "    1510501,\n",
- "    1900901,\n",
- "    2012237,\n",
- "    2167663,\n",
- "    2313911,\n",
- "    3012068,\n",
- "    5964307,\n",
- "    8605235,\n",
- "    8645208,\n",
- "]\n",
- "\n",
- "# Explicit datatypes for the non-string columns of the title table\n",
- "basics_dtypes = {\n",
- "    \"isAdult\": bool,\n",
- "    \"startYear\": float,\n",
- "    \"endYear\": float,\n",
- "    \"runtimeMinutes\": float,\n",
- "}\n",
- "\n",
- "data_film = pd.read_csv(\n",
- "    'dat/title.basics.tsv',\n",
- "    sep='\\t',\n",
- "    na_values=['\\\\N'],\n",
- "    dtype=basics_dtypes,\n",
- "    skiprows=malformed_basics_rows,\n",
- ")\n",
- "# tconst (string) - alphanumeric unique identifier of the title\n",
- "# titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)\n",
- "# primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release\n",
- "# originalTitle (string) - original title, in the original language\n",
- "# isAdult (boolean) - 0: non-adult title; 1: adult title\n",
- "# startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year\n",
- "# endYear (YYYY) – TV Series end year. ‘\\N’ for all other title types\n",
- "# runtimeMinutes – primary runtime of the title, in minutes\n",
- "# genres (string array) – includes up to three genres associated with the title\n",
- "\n",
- "# Ratings per title. Both columns are read as float here; numVotes is cast to\n",
- "# an integer datatype further below, after the tables have been merged.\n",
- "# (The former chained assignment `data_rating = data = ...` also created an\n",
- "# unused global `data`; only `data_rating` is bound now.)\n",
- "data_rating = pd.read_csv(\n",
- "    'dat/title.ratings.tsv',\n",
- "    sep='\\t',\n",
- "    na_values=['\\\\N'],\n",
- "    dtype={\n",
- "        \"averageRating\": float,\n",
- "        \"numVotes\": float,\n",
- "    },\n",
- ")\n",
- "# tconst (string) - alphanumeric unique identifier of the title\n",
- "# averageRating – weighted average of all the individual user ratings\n",
- "# numVotes - number of votes the title has received\n",
- "\n",
- "# Principal cast/crew entries per title; `ordering` is read as float\n",
- "data_principals = pd.read_csv(\n",
- "    'dat/title.principals.tsv',\n",
- "    sep='\\t',\n",
- "    na_values=['\\\\N'],\n",
- "    dtype={\"ordering\": float},\n",
- ")\n",
- "# tconst (string) - alphanumeric unique identifier of the title\n",
- "# ordering (integer) – a number to uniquely identify rows for a given titleId\n",
- "# nconst (string) - alphanumeric unique identifier of the name/person\n",
- "# category (string) - the category of job that person was in\n",
- "# job (string) - the specific job title if applicable, else '\\N'\n",
- "# characters (string) - the name of the character played if applicable, else '\\N'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "----------\n",
- "\n",
- "### Clean and merge original data into prepared datasets for experiments"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Initially, the dataset contains 600289 movies.'"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "'Quantiles 5% and 95% on runtime minutes yield as delimitation minutes [52.0, 135.0].'"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "'After dropping rows of these quantiles, the dataset contains 341225 movies, which is 259064 less movies.'"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Keep only higher quality movies, hence,\n",
- "# - keep only rows whose type is \"movie\"\n",
- "data_film = data_film[data_film[\"titleType\"] == \"movie\"]\n",
- "movies_count_before = data_film.shape[0]\n",
- "\n",
- "# - keep only rows with a typical runtime, i.e. inside the 5%-95% quantile range\n",
- "#   (rows without a runtime fail both comparisons and are removed as well)\n",
- "quantile = data_film[\"runtimeMinutes\"].quantile([0.05, 0.95])\n",
- "shortest_kept, longest_kept = quantile[0.05], quantile[0.95]\n",
- "runtime_minutes = data_film[\"runtimeMinutes\"]\n",
- "data_film = data_film[(runtime_minutes >= shortest_kept) & (runtime_minutes <= longest_kept)]\n",
- "movies_count_after = data_film.shape[0]\n",
- "\n",
- "# Report how many movies the runtime filter removed\n",
- "display(f\"Initially, the dataset contains {movies_count_before} movies.\")\n",
- "display(f\"Quantiles 5% and 95% on runtime minutes yield as delimitation minutes {list(quantile)}.\")\n",
- "display(f\"After dropping rows of these quantiles, the dataset contains {movies_count_after} movies, which is {movies_count_before-movies_count_after} less movies.\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Remove the title features that are irrelevant or incomplete for our analysis,\n",
- "# then keep only rows that have both a start year and a runtime\n",
- "data_film = (\n",
- "    data_film\n",
- "    .drop(columns=[\"titleType\", \"primaryTitle\", \"originalTitle\", \"isAdult\", \"endYear\"])\n",
- "    .dropna(subset=[\"startYear\", \"runtimeMinutes\"])\n",
- ")\n",
- "\n",
- "# Of the principal cast members, keep only actors and actresses,\n",
- "# described by the remaining columns (everything except the dropped ones)\n",
- "data_principals = data_principals.drop(columns=[\"ordering\", \"nconst\", \"job\", \"characters\"])\n",
- "data_principals = data_principals[data_principals[\"category\"].isin([\"actor\", \"actress\"])]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Merge movie data on the title id. Both joins are inner joins, so only titles\n",
- "# present in all three tables remain.\n",
- "film_with_rating = pd.merge(data_film, data_rating, how=\"inner\", on=\"tconst\")\n",
- "data_movie = pd.merge(film_with_rating, data_principals, how=\"inner\", on=\"tconst\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Provide atomic genre data on movies: one row per title and single genre\n",
- "data_movie_genre = (\n",
- "    data_film\n",
- "    # Features that are irrelevant for genres, and titles without any genre\n",
- "    .drop(columns=[\"startYear\", \"runtimeMinutes\"])\n",
- "    .dropna(subset=[\"genres\"])\n",
- "    # Break the comma-separated genre string down to one genre per row\n",
- "    .assign(genres=lambda frame: frame[\"genres\"].str.split(\",\"))\n",
- "    .explode(\"genres\")\n",
- "    .reset_index(drop=True)\n",
- "    # The column now holds a single genre, so name it accordingly\n",
- "    .rename(columns={\"genres\": \"genre\"})\n",
- ")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Convert integer numbers to integer datatypes"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "# These columns were read as float; cast all three to int in one step.\n",
- "# The cast raises on missing values, so the columns must be complete here.\n",
- "data_movie = data_movie.astype({\n",
- "    \"startYear\": int,\n",
- "    \"runtimeMinutes\": int,\n",
- "    \"numVotes\": int,\n",
- "})"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Write preprocessed data to files"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Write both prepared datasets as CSV, without the row index as an extra column\n",
- "for prepared_frame, target_file in (\n",
- "    (data_movie, \"dat/data_movie.csv\"),\n",
- "    (data_movie_genre, \"dat/data_movie_genre.csv\"),\n",
- "):\n",
- "    prepared_frame.to_csv(target_file, index=False)"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.8"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
- }
|