# Data Literacy - Project
## Gender Share in Movies
#### Tobias Stumpp, Sophia Herrmann

### README & TODO

Please run all cells of this Jupyter notebook once. You may use the button that is revealed by executing the next cell.  
Running the notebook is the preprocessing step for this repository: it prepares the data files that all experiments rely on.

In [1]:
from IPython.display import Javascript, display
from ipywidgets import widgets

def run_all(ev):
    """Button callback: execute every cell below the currently selected one.

    Parameters
    ----------
    ev : object
        Argument supplied by the widget's click handler; it is not used.
    """
    # The snippet runs in the browser and relies on the front end's global
    # `IPython.notebook` JavaScript object.
    # NOTE(review): presumably only the classic Notebook front end exposes this
    # object - confirm before relying on the button in other front ends.
    display(Javascript('IPython.notebook.execute_cell_range(IPython.notebook.get_selected_index()+1, IPython.notebook.ncells())'))

# Show the instruction text, then a button that triggers `run_all` on click.
display("Please click this button below to provide the required preprocessed data files for the experiments:")
button = widgets.Button(description="Run all cells below")
# Register the callback before the widget is rendered.
button.on_click(run_all)
display(button)

'Please click this button below to provide the required preprocessed data files for the experiments:'

Button(description='Run all cells below', style=ButtonStyle())

In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
# Work from the parent directory so that the relative 'dat/...' paths used
# below resolve (presumably the repository root - confirm).
path = '../'

# Record the working directory the first time this cell runs and always
# resolve `path` against it. Re-running the cell then lands in the same
# directory instead of climbing one level higher on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, path))

### Extract data archive files

In [4]:
import gzip
import shutil

# Compressed source tables that have to be extracted before they can be read.
files = [
    f'dat/title.{table}.tsv.gz'
    for table in ('basics', 'principals', 'ratings')
]

def unzip(files=files):
    """Decompress every ``.gz`` archive listed in ``files``.

    Each archive is written to the same path with the trailing ``.gz``
    removed; the target is opened in write mode, so an existing file is
    overwritten. Entries that do not end in ``.gz`` are skipped.

    Parameters
    ----------
    files : list of str
        Paths of the archives to extract. Defaults to the module-level list.
    """
    gz_suffix = '.gz'
    for archive in files:
        if not archive.endswith(gz_suffix):
            continue
        target = archive[:-len(gz_suffix)]
        # Stream the content so the whole table never has to fit in memory.
        with gzip.open(archive, 'rb') as compressed, open(target, 'wb') as extracted:
            shutil.copyfileobj(compressed, extracted)


# Extract all source tables so the following cells can read plain TSV files.
unzip(files)

### Read data files

In [5]:
# Explicit datatypes for the non-string columns of the title table.
film_dtypes = {
    "isAdult": bool,
    "startYear": float,
    "endYear": float,
    "runtimeMinutes": float,
}

# Lines that are syntactically incorrect and would therefore cause
# - a column shift within the row
# - assignment errors for column datatypes
# They are skipped while reading.
film_broken_rows = [
    1098292,
    1510501,
    1900901,
    2012237,
    2167663,
    2313911,
    3012068,
    5964307,
    8605235,
    8645208,
]

data_film = pd.read_csv(
    'dat/title.basics.tsv',
    sep='\t',
    na_values=['\\N'],  # the literal \N marks a missing value
    dtype=film_dtypes,
    skiprows=film_broken_rows,
)
# Columns of the title table:
# tconst (string) - alphanumeric unique identifier of the title
# titleType (string) – the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc)
# primaryTitle (string) – the more popular title / the title used by the filmmakers on promotional materials at the point of release
# originalTitle (string) - original title, in the original language
# isAdult (boolean) - 0: non-adult title; 1: adult title
# startYear (YYYY) – represents the release year of a title. In the case of TV Series, it is the series start year
# endYear (YYYY) – TV Series end year. ‘\N’ for all other title types
# runtimeMinutes – primary runtime of the title, in minutes
# genres (string array) – includes up to three genres associated with the title

data_rating = pd.read_csv(
    'dat/title.ratings.tsv',
    sep='\t',
    na_values=['\\N'],  # the literal \N marks a missing value
    dtype={"averageRating": float, "numVotes": float},
)
# NOTE(review): `data` is a second name for the same frame; it looks like a
# leftover, since no cell in this notebook reads it - confirm before removing.
data = data_rating
# Columns of the ratings table:
# tconst (string) - alphanumeric unique identifier of the title
# averageRating – weighted average of all the individual user ratings
# numVotes - number of votes the title has received

data_principals = pd.read_csv(
    'dat/title.principals.tsv',
    sep='\t',
    na_values=['\\N'],  # the literal \N marks a missing value
    dtype={"ordering": float},
)
# Columns of the principals table:
# tconst (string) - alphanumeric unique identifier of the title
# ordering (integer) – a number to uniquely identify rows for a given titleId
# nconst (string) - alphanumeric unique identifier of the name/person
# category (string) - the category of job that person was in
# job (string) - the specific job title if applicable, else '\N'
# characters (string) - the name of the character played if applicable, else '\N'

----------

###  Clean and merge original data into prepared datasets for experiments

In [6]:
# Keep only higher quality movies, hence,
# - keep only rows whose type is "movie"
data_film = data_film[data_film["titleType"] == "movie"]

# - drop rows with *untypical runtime minutes*
movies_count_before = data_film.shape[0]

# Rows without a runtime fail both bound checks below and are therefore
# removed together with the outliers. Count them separately so the report
# does not attribute the whole drop to the quantile trimming.
movies_count_missing_runtime = int(data_film["runtimeMinutes"].isna().sum())

quantile = data_film["runtimeMinutes"].quantile([0.05, 0.95])

# `between` is inclusive on both ends, i.e. lower <= runtime <= upper.
data_film = data_film[
    data_film["runtimeMinutes"].between(quantile[0.05], quantile[0.95])
]

movies_count_after = data_film.shape[0]

display(f"Initially, the dataset contains {movies_count_before} movies.")
display(f"Quantiles 5% and 95% on runtime minutes yield as delimitation minutes {list(quantile)}.")
display(f"After dropping rows of these quantiles, the dataset contains {movies_count_after} movies, which is {movies_count_before-movies_count_after} less movies.")
display(f"Of the dropped movies, {movies_count_missing_runtime} have no runtime minutes at all and are removed regardless of the quantiles.")

'Initially, the dataset contains 600289 movies.'

'Quantiles 5% and 95% on runtime minutes yield as delimitation minutes [52.0, 135.0].'

'After dropping rows of these quantiles, the dataset contains 341225 movies, which is 259064 less movies.'

In [7]:
# Remove the features that are irrelevant or incomplete for our analysis and
# keep only movies with a known start year and runtime.
data_film = (
    data_film
    .drop(columns=["titleType", "primaryTitle", "originalTitle", "isAdult", "endYear"])
    .dropna(subset=["startYear", "runtimeMinutes"])
)

# From the principals only the title id and the job category are needed.
data_principals = data_principals.drop(columns=["ordering", "nconst", "job", "characters"])

# Restrict the principal cast members to actors and actresses.
data_principals = data_principals[
    data_principals["category"].isin(["actor", "actress"])
]

In [8]:
# Combine films, ratings and cast on the title id. Both joins are inner joins,
# so only titles present in all three tables remain.
data_movie = (
    data_film
    .merge(data_rating, how="inner", on="tconst")
    .merge(data_principals, how="inner", on="tconst")
)

In [9]:
# Provide atomic genre data on movies: one row per (movie, genre) pair.
data_movie_genre = (
    data_film
    # Features that are irrelevant for the genre table
    .drop(columns=["startYear", "runtimeMinutes"])
    # Movies without any genre cannot contribute a row
    .dropna(subset=["genres"])
    # Turn the comma-separated genre string into a list ...
    .assign(genres=lambda frame: frame["genres"].str.split(","))
    # ... and emit one row per list element
    .explode("genres")
    .reset_index(drop=True)
    # Each row now holds a single genre, so name the column accordingly
    .rename(columns={"genres": "genre"})
)

### Convert integer numbers to integer datatypes

In [10]:
# These columns were read as floats; convert them to integers in one pass.
data_movie = data_movie.astype({
    "startYear": int,
    "runtimeMinutes": int,
    "numVotes": int,
})

### Write preprocessed data to files

In [11]:
# Persist both prepared datasets as CSV files without the row index.
for frame, target in (
    (data_movie, "dat/data_movie.csv"),
    (data_movie_genre, "dat/data_movie_genre.csv"),
):
    frame.to_csv(target, index=False)