Decoding Startup Success: A Data Science Approach to Predicting Venture Outcomes

By: Mohammed Syed

Table of Contents

  • Introduction
  • Part 1: Data Collection
  • Part 2: Data Cleaning
  • Part 3: Exploratory Data Analysis
  • Part 4: Model: Analysis, Hypothesis Testing & ML
  • Part 5: Conclusions
  • References

Introduction

The journey from a simple idea to a thriving business is one of excitement, innovation, and, yes, plenty of challenges. In today’s fast-paced and ever-changing entrepreneurial landscape, countless startups are born each day, brimming with potential. However, despite their ingenuity, a significant number of these startups fail to reach long-term success, often due to unforeseen hurdles and a lack of strategic foresight. This is where our research comes in.

"Decoding Startup Success: A Data Science Approach to Predicting Venture Outcomes" uses the power of machine learning to unlock the mysteries behind what makes a startup thrive—or stumble. By analyzing a rich dataset that includes funding histories, team dynamics, and pivotal exit events, our goal is to uncover the key drivers of success and provide actionable insights. Whether you're a founder navigating the startup maze, an investor looking to make smarter decisions, or a stakeholder eager to understand the startup ecosystem better, this study offers valuable tools to help you make data-driven choices that can truly shape the future of your venture.

With cutting-edge analytical techniques and robust predictive models, this research aims to enhance our collective understanding of the startup world, empowering decision-makers with the knowledge to guide their ventures toward success. It's not just about predicting outcomes; it's about using data to craft a roadmap for success in the dynamic world of startups.

Defining Startup Success

When we think about predicting the success of a startup, it's essential to have a clear definition of what success really means. In this study, we define a successful startup using several measurable criteria that highlight its growth, resilience, and market impact. These indicators provide powerful insights into how a startup thrives, how it navigates challenges, and how it stands out in an ever-evolving business landscape.

  • Financial Metrics: Think about the money that powers a startup’s engine! Key financial indicators like the total funding raised, IPO valuations, and successful funding rounds give us a peek into a startup's financial health and investor confidence. These metrics are often the first signs of a promising venture.
  • Milestones and Achievements: Success isn’t just about raising money—it's about achieving those significant milestones! Product launches, market expansions, and strategic partnerships show that a startup is moving forward and making an impact in the industry.
  • Founding Team Composition: The dream team behind a startup is often what sets it apart. The experience, vision, and expertise of the founders play a critical role in driving innovation, solving problems, and leading the company to new heights.
  • Innovation and Market Adaptability: In today’s fast-paced world, a startup’s ability to innovate and pivot is vital. Whether it's new products, services, or creative solutions to market challenges, staying adaptable and ahead of trends is what keeps a startup competitive and relevant.

For this study, we specifically consider startups as successful if they meet one of the following criteria: they were acquired, went public (IPO), or are still operating. Additionally, startups must meet the following thresholds for inclusion: acquisitions must have occurred for at least $100,000 (1e5 USD), and startups that are still operating must have been active for a minimum of one year. These thresholds are configurable and can be adjusted in the code provided in the accompanying notebook, allowing users to define their own success criteria based on their specific research needs.
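To make these criteria concrete, here is a minimal sketch of how a configurable success label could be derived. The column names (price_amount, age_days) and the helper itself are illustrative assumptions, not the exact implementation in the accompanying notebook:

import pandas as pd

# Configurable thresholds (adjust to match your own definition of success)
MIN_ACQUISITION_PRICE_USD = 1e5   # acquisitions below this are not counted as successes
MIN_OPERATING_DAYS = 365          # operating startups must be at least one year old

def label_success(row):
    """Return 1 if a startup meets the success criteria described above, else 0."""
    if row['status'] == 'ipo':
        return 1
    if row['status'] == 'acquired' and row.get('price_amount', 0) >= MIN_ACQUISITION_PRICE_USD:
        return 1
    if row['status'] == 'operating' and row.get('age_days', 0) >= MIN_OPERATING_DAYS:
        return 1
    return 0

# Usage (assuming a DataFrame with 'status', 'price_amount', and 'age_days' columns):
# companies['success'] = companies.apply(label_success, axis=1)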

By analyzing these dynamic factors, our research aims to build a comprehensive picture of what makes a startup successful. We’re using these key criteria to guide our predictive models, helping us uncover the insights that can make a real difference for founders, investors, and anyone involved in the startup ecosystem. These criteria are not just numbers—they are the driving forces behind every great success story in the startup world!

Part 1: Data Collection

In this section, we focus on gathering the essential data required for our analysis. After an extensive search on Kaggle, I identified a comprehensive dataset that provides a wealth of information about the startup ecosystem. The dataset, known as Startup Investments, offers insights into organizations, individuals, company news, funding rounds, acquisitions, and IPOs, making it a valuable resource for understanding the dynamics of the venture ecosystem up to December 2013.

By leveraging this rich data source, we aim to explore the factors that contribute to startup success and failure. The dataset's detailed information on funding histories, team compositions, and exit events is crucial for developing predictive models that capture the nuances of startup success. This analysis will provide actionable insights for founders, investors, and stakeholders in the startup ecosystem.

Let's start by loading the datasets using pandas.

Data Source

Dataset Overview

We have a total of 11 datasets, each contributing valuable information that aids in understanding the dynamics of startup success. These datasets include:

  • Acquisitions Data: Details on acquisition events
  • Degrees Data: Information on educational backgrounds
  • Funding Rounds Data: Records of funding events
  • Funds Data: Data on various funds
  • Investments Data: Details on investment activities
  • IPOs Data: Information on initial public offerings
  • Milestones Data: Records of key achievements
  • Objects Data: Comprehensive data on entities within the ecosystem
  • Offices Data: Information on office locations
  • People Data: Details on individuals involved
  • Relationships Data: Data on connections between entities

We will be utilizing these datasets to conduct a thorough analysis, aiming to uncover the factors that contribute to startup success.

In [151]:
# Importing pandas
import pandas as pd
In [152]:
# Loading all 11 datasets
acquisitions_data = pd.read_csv('data/acquisitions.csv')
degrees_data = pd.read_csv('data/degrees.csv')
funding_rounds_data = pd.read_csv('data/funding_rounds.csv')
funds_data = pd.read_csv('data/funds.csv')
investments_data = pd.read_csv('data/investments.csv')
ipos_data = pd.read_csv('data/ipos.csv')
milestones_data = pd.read_csv('data/milestones.csv')
objects_data = pd.read_csv('data/objects.csv')
offices_data = pd.read_csv('data/offices.csv')
people_data = pd.read_csv('data/people.csv')
relationships_data = pd.read_csv('data/relationships.csv')
Let's take a look at the objects dataset.
In [153]:
objects_data.head()
Out[153]:
id entity_type entity_id parent_id name normalized_name permalink category_code status founded_at ... last_funding_at funding_rounds funding_total_usd first_milestone_at last_milestone_at milestones relationships created_by created_at updated_at
0 c:1 Company 1 NaN Wetpaint wetpaint /company/wetpaint web operating 2005-10-17 ... 2008-05-19 3 39750000.0 2010-09-05 2013-09-18 5 17 initial-importer 2007-05-25 06:51:27 2013-04-13 03:29:00
1 c:10 Company 10 NaN Flektor flektor /company/flektor games_video acquired NaN ... NaN 0 0.0 NaN NaN 0 6 initial-importer 2007-05-31 21:11:51 2008-05-23 23:23:14
2 c:100 Company 100 NaN There there /company/there games_video acquired NaN ... NaN 0 0.0 2003-02-01 2011-09-23 4 12 initial-importer 2007-08-06 23:52:45 2013-11-04 02:09:48
3 c:10000 Company 10000 NaN MYWEBBO mywebbo /company/mywebbo network_hosting operating 2008-07-26 ... NaN 0 0.0 NaN NaN 0 0 NaN 2008-08-24 16:51:57 2008-09-06 14:19:18
4 c:10001 Company 10001 NaN THE Movie Streamer the movie streamer /company/the-movie-streamer games_video operating 2008-07-26 ... NaN 0 0.0 NaN NaN 0 0 NaN 2008-08-24 17:10:34 2008-09-06 14:19:18

5 rows × 40 columns

Since we have 11 DataFrames, we will not be taking a look at all of them individually, but we still want to understand the structure of the data.

Let's try to summarize key details of each DataFrame into a single table.

In [154]:
# List of all DataFrames and their names
dataframes = {
    'acquisitions_data': acquisitions_data,
    'degrees_data': degrees_data,
    'funding_rounds_data': funding_rounds_data,
    'funds_data': funds_data,
    'investments_data': investments_data,
    'ipos_data': ipos_data,
    'milestones_data': milestones_data,
    'objects_data': objects_data,
    'offices_data': offices_data,
    'people_data': people_data,
    'relationships_data': relationships_data,
}

# Create a summary table
summary = []
for name, df in dataframes.items():
    summary.append({
        'Dataset': name,
        'Rows': df.shape[0],
        'Columns': df.shape[1],
        'Column Names': df.columns.tolist()[1:]
    })

# Convert summary to DataFrame
summary_df = pd.DataFrame(summary)

# Display summary
print(summary_df)
                Dataset    Rows  Columns  \
0     acquisitions_data    9562       12   
1          degrees_data  109610        8   
2   funding_rounds_data   52928       23   
3            funds_data    1564       11   
4      investments_data   80902        6   
5             ipos_data    1259       13   
6       milestones_data   39456        9   
7          objects_data  462651       40   
8          offices_data  112718       15   
9           people_data  226709        6   
10   relationships_data  402878       11   

                                         Column Names  
0   [acquisition_id, acquiring_object_id, acquired...  
1   [object_id, degree_type, subject, institution,...  
2   [funding_round_id, object_id, funded_at, fundi...  
3   [fund_id, object_id, name, funded_at, raised_a...  
4   [funding_round_id, funded_object_id, investor_...  
5   [ipo_id, object_id, valuation_amount, valuatio...  
6   [object_id, milestone_at, milestone_code, desc...  
7   [entity_type, entity_id, parent_id, name, norm...  
8   [object_id, office_id, description, region, ad...  
9   [object_id, first_name, last_name, birthplace,...  
10  [relationship_id, person_object_id, relationsh...  

Part 2: Data Cleaning

In this section, we dive into the essential process of **data cleaning**—the unsung hero of any successful data analysis project. Data cleaning is all about transforming raw, messy data into something that's accurate, consistent, and ready for meaningful analysis. It’s like preparing a canvas before you start painting—it sets the stage for everything that follows.

Why is it so important? Well, real-world data is rarely perfect. It often contains missing values, errors, inconsistencies, and outliers that can skew results and lead to misleading conclusions. Without proper cleaning, even the best analysis can fall apart. By carefully handling these imperfections—whether it’s filling in missing values, correcting errors, or standardizing formats—we ensure that the data we work with reflects reality as closely as possible.

In this part of the project, we’ll tackle these challenges head-on, ensuring that the dataset is reliable and ready for the next steps in our analysis. The cleaner the data, the clearer and more impactful our insights will be. Let’s get started!

In [155]:
# Let's start with acquisitions_data

# Check for null values
acquisitions_data.isnull().sum()

# Remove columns that are not needed
acquisitions_data = acquisitions_data.drop(columns=["source_url", "source_description", "created_at", "updated_at"])
# There are 7652 null values in term_code out of 9562 total rows; let's see the values in this column
acquisitions_data["term_code"].value_counts()
# From the output, we can see that term_code describes the type of acquisition (cash, cash and stock, or stock). Since most of the rows are null, we can drop this column
acquisitions_data = acquisitions_data.drop(columns=["term_code"])

# Check for duplicate rows
acquisitions_data.duplicated().sum()
# Check for duplicate columns
acquisitions_data.columns.duplicated().sum()

# Lets check the data types of the columns
acquisitions_data.dtypes
# Convert the acquired_at column to datetime
acquisitions_data['acquired_at'] = pd.to_datetime(acquisitions_data['acquired_at'], errors='coerce')

acquisitions_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9562 entries, 0 to 9561
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   id                   9562 non-null   int64         
 1   acquisition_id       9562 non-null   int64         
 2   acquiring_object_id  9562 non-null   object        
 3   acquired_object_id   9561 non-null   object        
 4   price_amount         9562 non-null   float64       
 5   price_currency_code  9558 non-null   object        
 6   acquired_at          9533 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 523.0+ KB
In [156]:
# Let's remove the max price amount row as it seems to be false data
# It's been listed for 2.6 trillion dollars, which is not possible. 
max_price_index = acquisitions_data['price_amount'].idxmax()
acquisitions_data = acquisitions_data.drop(index=max_price_index)
In [157]:
# Let's do the cleaning for the other dataframes. 
degrees_data.isnull().sum()

degrees_data.duplicated().sum()
degrees_data.columns.duplicated().sum()

degrees_data.shape
# Let's remove the rows where degree_type, subject, institution and graduated_at are all null at the same time
degrees_data = degrees_data.dropna(subset=['degree_type', 'subject', 'institution', 'graduated_at'], how='all')
degrees_data = degrees_data.drop(columns=["updated_at", "created_at"])

degrees_data['graduated_at'] = pd.to_datetime(degrees_data['graduated_at'], errors='coerce')

degrees_data.info()
<class 'pandas.core.frame.DataFrame'>
Index: 109607 entries, 0 to 109609
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   id            109607 non-null  int64         
 1   object_id     109607 non-null  object        
 2   degree_type   98389 non-null   object        
 3   subject       81298 non-null   object        
 4   institution   109555 non-null  object        
 5   graduated_at  58054 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 5.9+ MB

Let's create a function that takes in a DataFrame and a list of columns and returns the same DataFrame with those columns normalized to lowercase.

In [158]:
def normalize_column_lower(df, columns):
    """
    Normalizes specified columns in a DataFrame to lowercase.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the columns to normalize.
    columns (list): A list of column names to convert to lowercase.

    Returns:
    pd.DataFrame: The DataFrame with specified columns normalized to lowercase.
    """
    # Apply str.lower() to each specified column
    df[columns] = df[columns].applymap(lambda x: x.lower() if isinstance(x, str) else x)
    return df
In [159]:
degrees_data.degree_type.unique()
# There are 7148 unique values in degree_type, which is a lot. Let's try to normalize the values.
normalize_column_lower(degrees_data, ["degree_type", "subject", "institution"])

mappings_for_degree_type = {
    "bachelors of science": "bs",
    "bachelor's of science": "bs",
    "bachelor of science": "bs",
    "b.sc": "bs",
    "bachelor of arts": "ba",
    "master of science": "ms",
    "master's of science": "ms",
    "m.sc": "ms",
    "ms.c": "ms",
    "master of arts": "ma",
    "master's of arts": "ma",
    "m.a": "ma",
    "masters in business administration": "mba",
    "master's in business administration": "mba",
    "m.b.a": "mba",
    "doctor of philosophy": "phd",
}

degrees_data['degree_type'] = degrees_data['degree_type'].replace(mappings_for_degree_type)

degrees_data.degree_type.unique()
Out[159]:
array(['mba', 'ba', 'ms', ..., 'masters in applied finance',
       'master of public health', "master's of music"],
      shape=(6741,), dtype=object)
In [160]:
funding_rounds_data.duplicated().sum()
funding_rounds_data.columns.duplicated().sum()

funding_rounds_data = funding_rounds_data.drop(columns=["source_url", "source_description", "created_by" ,"created_at", "updated_at"])
funding_rounds_data['funded_at'] = pd.to_datetime(funding_rounds_data['funded_at'], errors='coerce')

# Let's normalize funding_round_type to lowercase
normalize_column_lower(funding_rounds_data, ["funding_round_type"])

funding_rounds_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52928 entries, 0 to 52927
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   id                        52928 non-null  int64         
 1   funding_round_id          52928 non-null  int64         
 2   object_id                 52928 non-null  object        
 3   funded_at                 52680 non-null  datetime64[ns]
 4   funding_round_type        52928 non-null  object        
 5   funding_round_code        52928 non-null  object        
 6   raised_amount_usd         52928 non-null  float64       
 7   raised_amount             52928 non-null  float64       
 8   raised_currency_code      49862 non-null  object        
 9   pre_money_valuation_usd   52928 non-null  float64       
 10  pre_money_valuation       52928 non-null  float64       
 11  pre_money_currency_code   26883 non-null  object        
 12  post_money_valuation_usd  52928 non-null  float64       
 13  post_money_valuation      52928 non-null  float64       
 14  post_money_currency_code  30448 non-null  object        
 15  participants              52928 non-null  int64         
 16  is_first_round            52928 non-null  int64         
 17  is_last_round             52928 non-null  int64         
dtypes: datetime64[ns](1), float64(6), int64(5), object(6)
memory usage: 7.3+ MB
In [161]:
funds_data.duplicated().sum()
funds_data.columns.duplicated().sum()

funds_data = funds_data.drop(columns=["source_url", "source_description", "created_at", "updated_at"])

funds_data[funds_data["funded_at"].isnull()]

# Convert the funded_at column to datetime
funds_data['funded_at'] = pd.to_datetime(funds_data['funded_at'], errors='coerce')

funds_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1564 entries, 0 to 1563
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    1564 non-null   int64         
 1   fund_id               1564 non-null   int64         
 2   object_id             1564 non-null   object        
 3   name                  1564 non-null   object        
 4   funded_at             1449 non-null   datetime64[ns]
 5   raised_amount         1564 non-null   float64       
 6   raised_currency_code  1564 non-null   object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 85.7+ KB
In [162]:
investments_data.duplicated().sum()
investments_data.columns.duplicated().sum()

investments_data = investments_data.drop(columns=[ "created_at", "updated_at"])

investments_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80902 entries, 0 to 80901
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  80902 non-null  int64 
 1   funding_round_id    80902 non-null  int64 
 2   funded_object_id    80902 non-null  object
 3   investor_object_id  80902 non-null  object
dtypes: int64(2), object(2)
memory usage: 2.5+ MB
In [163]:
ipos_data.duplicated().sum()
ipos_data.columns.duplicated().sum()

ipos_data = ipos_data.drop(columns=["source_url", "source_description", "created_at", "updated_at"])

# Change public_at to datetime
ipos_data['public_at'] = pd.to_datetime(ipos_data['public_at'], errors='coerce')

ipos_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1259 entries, 0 to 1258
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   id                       1259 non-null   int64         
 1   ipo_id                   1259 non-null   int64         
 2   object_id                1254 non-null   object        
 3   valuation_amount         1259 non-null   float64       
 4   valuation_currency_code  1257 non-null   object        
 5   raised_amount            1259 non-null   float64       
 6   raised_currency_code     699 non-null    object        
 7   public_at                659 non-null    datetime64[ns]
 8   stock_symbol             1259 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 88.6+ KB
In [164]:
# We should also remove the rows where valuation_amount is 0 or raised_amount is 0 as it is not possible to go public for 0
ipos_data = ipos_data[ipos_data.valuation_amount != 0]
ipos_data = ipos_data[ipos_data.raised_amount != 0]

# Let's add a column to the ipos_data dataframe which is the difference between valuation_amount and raised_amount
ipos_data['difference'] = ipos_data.valuation_amount - ipos_data.raised_amount
In [165]:
# Check for duplicate rows and columns
milestones_data.duplicated().sum()
milestones_data.columns.duplicated().sum()

# convert milestone_at to datetime
milestones_data['milestone_at'] = pd.to_datetime(milestones_data['milestone_at'], errors='coerce')

milestones_data = milestones_data.drop(columns=["source_url", "source_description", "created_at", "updated_at"])

milestones_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39456 entries, 0 to 39455
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id              39456 non-null  int64         
 1   object_id       39456 non-null  object        
 2   milestone_at    39456 non-null  datetime64[ns]
 3   milestone_code  39456 non-null  object        
 4   description     39456 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(3)
memory usage: 1.5+ MB
In [166]:
# Check for duplicate rows and columns
objects_data.duplicated().sum()
objects_data.columns.duplicated().sum()

# Columns to drop: created_by, created_at, updated_at, homepage_url, twitter_username, logo_url, logo_width, logo_height, description, short_description, overview, permalink
objects_data = objects_data.drop(columns=["created_by", "created_at", "updated_at", "homepage_url", "twitter_username", "logo_url", "logo_width", "logo_height","description", "short_description", "overview", "permalink"])

# Convert founded_at, closed_at, first_investment_at, last_investment_at, first_funding_at, last_funding_at, first_milestone_at, last_milestone_at to datetime
date_columns = [
    'founded_at', 'closed_at', 'first_investment_at', 'last_investment_at',
    'first_funding_at', 'last_funding_at', 'first_milestone_at', 'last_milestone_at',
]

objects_data[date_columns] = objects_data[date_columns].apply(pd.to_datetime, errors='coerce')

# Normalize the category_code to lowercase
normalize_column_lower(objects_data, ["category_code"])

# Calculate the time difference in days
objects_data['time_difference'] = (objects_data['closed_at'] - objects_data['founded_at']).dt.days

# Identify rows with negative time differences
negative_diff = objects_data['time_difference'] < 0

# Switch the dates for negative differences
objects_data.loc[negative_diff, ['founded_at', 'closed_at']] = objects_data.loc[negative_diff, ['closed_at', 'founded_at']].values

# Recalculate the time difference to ensure it's positive
objects_data['time_difference'] = (objects_data['closed_at'] - objects_data['founded_at']).dt.days

def calculate_age(row):
    if pd.isnull(row['founded_at']):
        return None       
    # For operating companies with no closed_at date, use current date
    if pd.isnull(row['closed_at']) and (row['status'] not in ['acquired', 'closed']):
        return (pd.Timestamp.now() - pd.to_datetime(row['founded_at'])).days
    elif not pd.isnull(row['closed_at']):
        return (pd.to_datetime(row['closed_at']) - pd.to_datetime(row['founded_at'])).days
    return None

objects_data['time_difference'] = objects_data.apply(calculate_age, axis=1)

objects_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 462651 entries, 0 to 462650
Data columns (total 29 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   id                   462651 non-null  object        
 1   entity_type          462651 non-null  object        
 2   entity_id            462651 non-null  int64         
 3   parent_id            27715 non-null   object        
 4   name                 462647 non-null  object        
 5   normalized_name      462620 non-null  object        
 6   category_code        123186 non-null  object        
 7   status               462651 non-null  object        
 8   founded_at           100441 non-null  datetime64[ns]
 9   closed_at            2809 non-null    datetime64[ns]
 10  domain               174942 non-null  object        
 11  tag_list             106496 non-null  object        
 12  country_code         95043 non-null   object        
 13  state_code           54760 non-null   object        
 14  city                 90684 non-null   object        
 15  region               462651 non-null  object        
 16  first_investment_at  16956 non-null   datetime64[ns]
 17  last_investment_at   16956 non-null   datetime64[ns]
 18  investment_rounds    462651 non-null  int64         
 19  invested_companies   462651 non-null  int64         
 20  first_funding_at     31507 non-null   datetime64[ns]
 21  last_funding_at      31507 non-null   datetime64[ns]
 22  funding_rounds       462651 non-null  int64         
 23  funding_total_usd    462651 non-null  float64       
 24  first_milestone_at   100358 non-null  datetime64[ns]
 25  last_milestone_at    100358 non-null  datetime64[ns]
 26  milestones           462651 non-null  int64         
 27  relationships        462651 non-null  int64         
 28  time_difference      95710 non-null   float64       
dtypes: datetime64[ns](8), float64(2), int64(6), object(13)
memory usage: 102.4+ MB
In [167]:
# Check for duplicate rows and columns
offices_data.duplicated().sum()
offices_data.columns.duplicated().sum()

# drop created_at and updated_at
offices_data = offices_data.drop(columns=["created_at", "updated_at"])

# remove rows where latitude and longitude are null
offices_data = offices_data.dropna(subset=['latitude', 'longitude'])

offices_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112718 entries, 0 to 112717
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            112718 non-null  int64  
 1   object_id     112718 non-null  object 
 2   office_id     112718 non-null  int64  
 3   description   68530 non-null   object 
 4   region        112718 non-null  object 
 5   address1      94430 non-null   object 
 6   address2      44520 non-null   object 
 7   city          107550 non-null  object 
 8   zip_code      93230 non-null   object 
 9   state_code    62017 non-null   object 
 10  country_code  112718 non-null  object 
 11  latitude      112718 non-null  float64
 12  longitude     112718 non-null  float64
dtypes: float64(2), int64(2), object(9)
memory usage: 11.2+ MB
In [168]:
people_data.duplicated().sum()
people_data.columns.duplicated().sum()

people_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226709 entries, 0 to 226708
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   id                226709 non-null  int64 
 1   object_id         226709 non-null  object
 2   first_name        226700 non-null  object
 3   last_name         226705 non-null  object
 4   birthplace        28084 non-null   object
 5   affiliation_name  226684 non-null  object
dtypes: int64(1), object(5)
memory usage: 10.4+ MB
In [169]:
relationships_data.duplicated().sum()
relationships_data.columns.duplicated().sum()

# drop created_at and updated_at
relationships_data = relationships_data.drop(columns=["created_at", "updated_at"])

# convert start_at and end_at to datetime
relationships_data['start_at'] = pd.to_datetime(relationships_data['start_at'], errors='coerce')
relationships_data['end_at'] = pd.to_datetime(relationships_data['end_at'], errors='coerce')

relationships_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402878 entries, 0 to 402877
Data columns (total 9 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   id                      402878 non-null  int64         
 1   relationship_id         402878 non-null  int64         
 2   person_object_id        402878 non-null  object        
 3   relationship_object_id  402878 non-null  object        
 4   start_at                206995 non-null  datetime64[ns]
 5   end_at                  101046 non-null  datetime64[ns]
 6   is_past                 402878 non-null  int64         
 7   sequence                402878 non-null  int64         
 8   title                   389526 non-null  object        
dtypes: datetime64[ns](2), int64(4), object(3)
memory usage: 27.7+ MB

Part 3: Exploratory Data Analysis (EDA)

In this section, we delve into Exploratory Data Analysis (EDA), a critical phase in our data science workflow. As data scientists, we understand that EDA is not just about visualizing data—it's about uncovering the underlying patterns and relationships that drive startup success. By systematically exploring our dataset, we aim to identify key variables, detect anomalies, and form hypotheses that will guide our predictive modeling.

Using a suite of analytical tools, we examine various dimensions of the dataset, such as funding trends, team dynamics, and market impacts. Through visualizations like histograms, scatter plots, and correlation matrices, we gain insights into the factors that differentiate successful startups from those that falter. This process is essential for validating our assumptions and ensuring that our models are built on a solid foundation.

EDA is more than just a preliminary step; it is an integral part of our data-driven approach. By thoroughly understanding the data, we make informed decisions that enhance the accuracy and reliability of our predictive models. This exploration not only enriches our analysis but also empowers us to provide actionable insights for founders, investors, and stakeholders in the startup ecosystem.

Let's proceed with confidence and precision, as we unlock the valuable insights hidden within our data. Our goal is to transform raw data into meaningful knowledge that can shape the future of startups.

Based on what we have thought about in Defining Startup Success, let's start exploring and visualizing the data.

In [170]:
# Import seaborn and matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
In [171]:
# Set the style
sns.set_style("whitegrid")
# Set the figure size
plt.figure(figsize=(10, 6))
Out[171]:
<Figure size 1000x600 with 0 Axes>
In [172]:
sns.countplot(y='status', data=objects_data, order=objects_data['status'].value_counts().index)
plt.xscale('log')
plt.title('Distribution of Startup Statuses (Log Scale)')
plt.xlabel('Frequency (Log Scale)')
plt.ylabel('Status')
plt.show()
[Figure: Distribution of Startup Statuses (Log Scale)]
In [173]:
# Let's try merging the objects_data and acquisitions_data to find out which startups got acquired for how much.

# First, let's merge objects_data with acquisitions_data on the company id (left join), keeping all rows from objects_data
m1_df = pd.merge(objects_data, acquisitions_data, left_on='id', right_on='acquired_object_id', how='left')
In [174]:
objects_data.status.value_counts()
Out[174]:
status
operating      443663
acquired         9394
live             4349
closed           2773
ipo              1134
beta              780
development       226
private           219
alpha             113
Name: count, dtype: int64

Out of a total of 462,651 startups, the distribution of their statuses is as follows:

  • Operating: 443,663 startups are currently operating.
  • Acquired: 9,394 startups have been acquired.
  • Live: 4,349 startups are live.
  • Closed: 2,773 startups have closed down.
  • IPO: 1,134 startups have gone public through an IPO.
  • Beta: 780 startups are in the beta phase.
  • Development: 226 startups are in development.
  • Private: 219 startups are private.
  • Alpha: 113 startups are in the alpha phase.
In [175]:
# Let's see the distribution of the price_currency_code for acquired startups
v1_m1_df = m1_df[m1_df.status == 'acquired']

sns.countplot(y='price_currency_code', data=v1_m1_df, order=v1_m1_df['price_currency_code'].value_counts().index)
plt.xscale('log')
plt.title('Distribution of Acquisition Price Currency Codes (Log Scale)')
plt.xlabel('Frequency (Log Scale)')
plt.ylabel('Price Currency Code')
plt.show()
[Figure: Distribution of Acquisition Price Currency Codes (Log Scale)]
In [176]:
# Let's find out how much each startup was acquired for, excluding acquisitions recorded as 0 since we don't have data on the actual price.
# Also, let's only consider startups that were acquired in USD
v2_m1_df = m1_df[(m1_df.status == 'acquired') & (m1_df.price_amount != 0) & (m1_df.price_currency_code == 'USD')]
sns.histplot(v2_m1_df.price_amount, bins=30, kde=True)
plt.xscale('log')
plt.title('Distribution of Acquisition Prices (Log Scale)')
plt.xlabel('Price Amount in USD (Log Scale)')
plt.ylabel('Frequency')
plt.show()
[Figure: Distribution of Acquisition Prices (Log Scale)]

This histogram shows the distribution of startup acquisition prices in USD on a logarithmic scale. The majority of acquisitions cluster in the lower price ranges (under $100M), with a long right tail extending to higher values around $1B+, demonstrating that while most startups sell for relatively modest amounts, there are rare but significant "mega-acquisitions" in the dataset.
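To put rough numbers behind that description, we can print the quantiles of the USD acquisition prices. This is just a quick check on the v2_m1_df subset defined above, not a new analysis step:

# Quantify the skew: median and upper quantiles of USD acquisition prices
print(v2_m1_df['price_amount'].describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.99]))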

In [177]:
# Now we shall merge objects_data and ipos_data to find out which startups went public for how much. 
m2_df = pd.merge(objects_data, ipos_data, left_on='id', right_on='object_id', how='inner')
In [178]:
v1_m2_df = m2_df[(m2_df.status == 'ipo') & ((m2_df.valuation_currency_code == 'USD') | (m2_df.raised_currency_code == 'USD'))]
plt.figure(figsize=(10, 6))
sns.scatterplot(data=v1_m2_df, x='raised_amount', y='valuation_amount', alpha=0.5)
plt.xscale('log')
plt.yscale('log')
plt.title('Valuation vs Raised Amount')
plt.xlabel('Raised Amount (USD, Log Scale)')
plt.ylabel('Valuation Amount (USD, Log Scale)')
# Add diagonal line for reference
plt.plot([1, 1e12], [1, 1e12], 'r--', alpha=0.5)
plt.show()
[Figure: Valuation vs Raised Amount]

This scatter plot displays the relationship between the raised amount and the valuation amount, both on a logarithmic scale. The red dashed line marks where valuation equals the amount raised (y = x). Most data points cluster above this line, indicating that valuations generally exceed the raised amounts. A few outliers deviate significantly from the trend.
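As a quick sanity check on that reading of the plot, we can compute the share of IPOs in this subset whose valuation exceeds the amount raised, using the v1_m2_df frame defined above:

# Fraction of IPOs where valuation_amount is greater than raised_amount
above_line = (v1_m2_df['valuation_amount'] > v1_m2_df['raised_amount']).mean()
print(f"{above_line:.1%} of IPOs in this subset have a valuation above the amount raised")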

In [179]:
plt.figure(figsize=(12, 7))  # Larger figure size for better readability
sns.violinplot(y=v1_m2_df.difference, cut=0, color='darkblue', alpha=0.7)
plt.yscale('log')
plt.ylim(1e8, 1e11)  # Adjust y-limits to focus on the meaningful range
plt.title('Distribution of IPO Valuation-Raised Difference (USD)', pad=20, fontsize=12)
plt.ylabel('Difference in USD (Log Scale)', fontsize=10)

# Add grid for better readability
plt.grid(True, alpha=0.3, which='both')

# Add median and mean lines
median = v1_m2_df.difference.median()
mean = v1_m2_df.difference.mean()
plt.axhline(y=median, color='red', linestyle='--', alpha=0.5, label=f'Median: ${median:,.0f}')
plt.axhline(y=mean, color='green', linestyle='--', alpha=0.5, label=f'Mean: ${mean:,.0f}')

plt.legend()
plt.show()
[Figure: Distribution of IPO Valuation-Raised Difference (USD)]
In [180]:
# Filter the DataFrame for startups that have a 'closed_at' date
closed_startups = objects_data.dropna(subset=['closed_at'])

# Now we check for status of the closed_startups
closed_startups.status.value_counts()

average_lifespan = closed_startups.groupby('status')['time_difference'].mean()

# Plot the average lifespan
plt.figure(figsize=(8, 6))
average_lifespan.plot(kind='bar', color=['green', 'red'], grid=False)
plt.title('Average Lifespan of Startups by Status')
plt.xlabel('Status')
plt.ylabel('Average Lifespan (Days)')
plt.xticks(rotation=0)
plt.show()
[Figure: Average Lifespan of Startups by Status]
In [181]:
plt.figure(figsize=(10, 6))
sns.violinplot(x='status', y='time_difference', data=closed_startups, hue='status', palette='muted', legend=False)
plt.title('Lifespan Distribution of Startups by Status')
plt.xlabel('Status')
plt.ylabel('Lifespan (Days)')
plt.show()
[Figure: Lifespan Distribution of Startups by Status]

The lifespan of a startup can significantly impact its success. The bar plot shows that acquired startups have a longer average lifespan than closed ones, suggesting longevity may correlate with success. However, the violin plot reveals a broader distribution for closed startups, indicating variability in their lifespans. The box plot within the violin plot shows that while many closed startups have shorter lifespans, there are also long-lived ones that didn't succeed. This suggests that while a longer lifespan can be beneficial, it is not the sole determinant of success.
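Because a handful of very old companies can pull the mean upward, a quick look at the median lifespan per status (using the closed_startups frame from above) gives a more robust comparison. This is an optional check rather than part of the main pipeline:

# The median is robust to outliers; compare it with the mean per status
lifespan_summary = closed_startups.groupby('status')['time_difference'].agg(['count', 'median', 'mean'])
print(lifespan_summary)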

Let's try implementing Survival Analysis on our dataset.

In [182]:
# Import necessary libraries
from lifelines import KaplanMeierFitter, CoxPHFitter
import numpy as np
In [183]:
m3_df = pd.merge(objects_data, funding_rounds_data, left_on='id', right_on='object_id', how='left')

# Create event column: 1 if the startup closed (event observed), else 0 (censored)
m3_df['event'] = m3_df['status'].apply(lambda x: 1 if x in ['closed'] else 0)

# Create duration column in days
m3_df['duration'] = m3_df["time_difference"]
m3_df['duration'] = m3_df['duration'].fillna((pd.Timestamp.today() - m3_df['founded_at']).dt.days)

# Drop rows with negative duration
m3_df = m3_df[m3_df['duration'] >= 0]

# Kaplan-Meier Survival Estimate
kmf = KaplanMeierFitter()
kmf.fit(durations=m3_df['duration'], event_observed=m3_df['event'])
kmf.plot_survival_function()
plt.title('Kaplan-Meier Survival Estimate')
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
plt.show()
[Figure: Kaplan-Meier Survival Estimate]

This graph shows a Kaplan-Meier survival estimate, which tracks the survival probability of a group over time. It indicates that survival probability drops sharply early on, stabilizes for a period, and then gradually declines over 40,000 days.
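If we want summary numbers from the fitted curve, lifelines exposes a few helpers on the KaplanMeierFitter object. Note that the median survival time may come back as infinity when the estimated curve never drops below 0.5, which is plausible here given how few closure events we observe:

# Median survival time (may be inf if the curve never crosses 0.5)
print(kmf.median_survival_time_)

# Estimated probability of still being alive at roughly 5 and 10 years
print(kmf.survival_function_at_times([5 * 365, 10 * 365]))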

In [184]:
# Cox Proportional Hazards Model
cph = CoxPHFitter()
selected_columns = ['funding_total_usd', 'investment_rounds', 'funding_rounds', 'category_code']
cox_df = m3_df[['duration', 'event'] + selected_columns].dropna()

# One-hot encode categorical variables
cox_df = pd.get_dummies(cox_df, columns=['category_code'], drop_first=True)

cph.fit(cox_df, duration_col='duration', event_col='event')
cph.print_summary()
model lifelines.CoxPHFitter
duration col 'duration'
event col 'event'
baseline estimation breslow
number of observations 106935
number of events observed 2591
partial log-likelihood -29353.83
time fit was run 2024-12-14 03:44:23 UTC
coef exp(coef) se(coef) coef lower 95% coef upper 95% exp(coef) lower 95% exp(coef) upper 95% cmp to z p -log2(p)
funding_total_usd -0.00 1.00 0.00 -0.00 -0.00 1.00 1.00 0.00 -8.31 <0.005 53.24
investment_rounds -0.43 0.65 0.19 -0.81 -0.05 0.45 0.95 0.00 -2.20 0.03 5.18
funding_rounds 0.21 1.23 0.01 0.19 0.23 1.21 1.25 0.00 20.45 <0.005 306.50
category_code_analytics -0.40 0.67 0.24 -0.87 0.07 0.42 1.07 0.00 -1.68 0.09 3.41
category_code_automotive 0.49 1.64 0.39 -0.27 1.26 0.76 3.52 0.00 1.26 0.21 2.27
category_code_biotech 0.11 1.12 0.14 -0.16 0.38 0.85 1.47 0.00 0.78 0.43 1.21
category_code_cleantech 0.50 1.65 0.17 0.18 0.83 1.19 2.29 0.00 3.02 <0.005 8.64
category_code_consulting -0.68 0.51 0.21 -1.09 -0.26 0.34 0.77 0.00 -3.20 <0.005 9.50
category_code_design -0.48 0.62 0.71 -1.88 0.92 0.15 2.51 0.00 -0.67 0.50 0.99
category_code_ecommerce 0.35 1.42 0.13 0.10 0.60 1.11 1.82 0.00 2.77 0.01 7.46
category_code_education -1.22 0.30 0.33 -1.87 -0.57 0.15 0.57 0.00 -3.67 <0.005 11.99
category_code_enterprise 0.00 1.00 0.15 -0.28 0.29 0.76 1.33 0.00 0.03 0.98 0.03
category_code_fashion 0.05 1.05 0.35 -0.63 0.73 0.53 2.08 0.00 0.14 0.89 0.17
category_code_finance 0.21 1.24 0.22 -0.23 0.65 0.80 1.92 0.00 0.95 0.34 1.55
category_code_games_video 0.92 2.50 0.12 0.68 1.16 1.97 3.18 0.00 7.52 <0.005 44.01
category_code_government -13.45 0.00 1283.81 -2529.68 2502.78 0.00 inf 0.00 -0.01 0.99 0.01
category_code_hardware 0.62 1.86 0.15 0.33 0.92 1.39 2.50 0.00 4.17 <0.005 14.99
category_code_health -1.41 0.24 0.46 -2.31 -0.51 0.10 0.60 0.00 -3.07 <0.005 8.89
category_code_hospitality -0.31 0.73 0.42 -1.13 0.51 0.32 1.67 0.00 -0.74 0.46 1.12
category_code_legal -1.08 0.34 0.59 -2.23 0.06 0.11 1.07 0.00 -1.85 0.06 3.96
category_code_local -13.44 0.00 359.43 -717.91 691.02 0.00 1.28e+300 0.00 -0.04 0.97 0.04
category_code_manufacturing -0.96 0.38 0.51 -1.96 0.04 0.14 1.04 0.00 -1.88 0.06 4.06
category_code_medical -0.10 0.91 0.24 -0.56 0.37 0.57 1.44 0.00 -0.41 0.68 0.55
category_code_messaging 1.07 2.91 0.23 0.61 1.52 1.85 4.58 0.00 4.61 <0.005 17.91
category_code_mobile 0.58 1.78 0.12 0.33 0.82 1.40 2.26 0.00 4.69 <0.005 18.47
category_code_music 0.22 1.24 0.33 -0.43 0.87 0.65 2.38 0.00 0.66 0.51 0.97
category_code_nanotech -13.60 0.00 568.78 -1128.38 1101.19 0.00 inf 0.00 -0.02 0.98 0.03
category_code_network_hosting 0.61 1.85 0.16 0.30 0.93 1.35 2.53 0.00 3.81 <0.005 12.82
category_code_news 0.19 1.21 0.31 -0.41 0.79 0.66 2.20 0.00 0.61 0.54 0.89
category_code_nonprofit -13.65 0.00 488.96 -971.99 944.69 0.00 inf 0.00 -0.03 0.98 0.03
category_code_other -0.10 0.90 0.14 -0.38 0.17 0.69 1.19 0.00 -0.73 0.47 1.10
category_code_pets -13.57 0.00 932.94 -1842.09 1814.96 0.00 inf 0.00 -0.01 0.99 0.02
category_code_photo_video 1.06 2.90 0.26 0.56 1.57 1.75 4.79 0.00 4.15 <0.005 14.88
category_code_public_relations 0.59 1.80 0.17 0.26 0.91 1.30 2.49 0.00 3.55 <0.005 11.34
category_code_real_estate -0.72 0.49 0.59 -1.87 0.43 0.15 1.54 0.00 -1.23 0.22 2.19
category_code_search 0.73 2.08 0.17 0.40 1.06 1.50 2.89 0.00 4.37 <0.005 16.33
category_code_security 0.13 1.14 0.23 -0.32 0.57 0.73 1.77 0.00 0.56 0.58 0.79
category_code_semiconductor 1.06 2.89 0.17 0.73 1.39 2.08 4.02 0.00 6.28 <0.005 31.50
category_code_social 0.97 2.64 0.16 0.65 1.30 1.92 3.65 0.00 5.91 <0.005 28.11
category_code_software 0.15 1.16 0.11 -0.07 0.38 0.93 1.46 0.00 1.32 0.19 2.42
category_code_sports 0.47 1.60 0.35 -0.21 1.15 0.81 3.17 0.00 1.35 0.18 2.51
category_code_transportation -0.16 0.85 0.51 -1.16 0.84 0.31 2.32 0.00 -0.31 0.75 0.41
category_code_travel 0.31 1.36 0.28 -0.23 0.85 0.79 2.34 0.00 1.11 0.27 1.92
category_code_web 1.07 2.91 0.11 0.85 1.28 2.35 3.60 0.00 9.80 <0.005 72.96

Concordance 0.69
Partial AIC 58795.66
log-likelihood ratio test 1006.26 on 44 df
-log2(p) of ll-ratio test 602.80

Model Performance:

  • Concordance Index: 0.69, indicating moderate predictive accuracy.
  • Partial AIC: 58,795.66, useful for model comparison.
  • Log-Likelihood Ratio Test: Highly significant (p < 0.005), confirming the model's validity.

Significant Predictors:

  • Investment Rounds (investment_rounds): Negative coefficient (HR = 0.65, p = 0.03) suggesting that more investment rounds are associated with increased survival probability.
  • Funding Rounds (funding_rounds): Positive coefficient (HR = 1.23, p < 0.005) indicating that more funding rounds may decrease survival time.
  • Category Codes:
    • Cleantech: HR = 1.65 (p < 0.005) implies higher hazard, reducing survival.
    • Consulting: HR = 0.51 (p < 0.005) suggests lower hazard, enhancing survival.
    • Ecommerce: HR = 1.42 (p = 0.01) indicates increased hazard.
    • Games & Video: HR = 2.50 (p < 0.005) significantly raises hazard.

Non-significant or Problematic Variables:

  • Several categories (e.g., nanotech, government, nonprofit) show extreme hazard ratios (HR = 0 or ∞), likely due to data sparsity or multicollinearity, necessitating further investigation.

Read more about the Cox Proportional Hazards Model if you want the full details of how these coefficients are estimated and interpreted.
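To pull the statistically significant effects out of this large summary programmatically, one option (a small convenience sketch, not part of the original pipeline) is to filter the fitted model's summary DataFrame by p-value:

# Keep only covariates with p < 0.05 and show their hazard ratios
significant = cph.summary[cph.summary['p'] < 0.05]
print(significant[['coef', 'exp(coef)', 'p']].sort_values('exp(coef)', ascending=False))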
In [185]:
# Convert duration and event to numeric explicitly and drop NaNs
m3_df['duration'] = pd.to_numeric(m3_df['duration'], errors='coerce')
m3_df['event'] = pd.to_numeric(m3_df['event'], errors='coerce')
m3_df.dropna(subset=['duration', 'event'], inplace=True)

# Remove infinite values if any
m3_df = m3_df[~m3_df['duration'].isin([np.inf, -np.inf])]

plt.figure(figsize=(10, 6))
for category in m3_df['category_code'].unique():
    category_df = m3_df[m3_df['category_code'] == category]
    # Check if the DataFrame is not empty and has sufficient data
    if not category_df.empty and category_df['duration'].notnull().all() and category_df['event'].notnull().all():
        kmf = KaplanMeierFitter()
        kmf.fit(durations=category_df['duration'], event_observed=category_df['event'], label=category)
        kmf.plot_survival_function()

plt.title('Kaplan-Meier Survival Estimate by Category')
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
plt.legend(title='Category')
plt.show()
[Figure: Kaplan-Meier Survival Estimate by Category]

That's a lot of categories; let's try grouping them into broader sectors to make the plot easier to read.

In [186]:
# Define the category mappings
category_mappings = {
    'web': 'Tech', 'mobile': 'Tech', 'software': 'Tech', 'network_hosting': 'Tech', 'hardware': 'Tech', 'semiconductor': 'Tech', 'security': 'Tech',
    'biotech': 'Life Sciences', 'health': 'Life Sciences', 'medical': 'Life Sciences',
    'games_video': 'Media and Entertainment', 'photo_video': 'Media and Entertainment', 'music': 'Media and Entertainment', 'social': 'Media and Entertainment', 'messaging': 'Media and Entertainment',
    'education': 'Services', 'consulting': 'Services', 'legal': 'Services', 'real_estate': 'Services',
    'ecommerce': 'Consumer Goods and Retail', 'fashion': 'Consumer Goods and Retail', 'design': 'Consumer Goods and Retail',
    'manufacturing': 'Industrial and Energy', 'cleantech': 'Industrial and Energy', 'automotive': 'Industrial and Energy'
}

# Map the categories to broader groups
m3_df['grouped_category'] = m3_df['category_code'].map(category_mappings).fillna('Other')

# Plotting
plt.figure(figsize=(12, 8))
kmf = KaplanMeierFitter()

for group in m3_df['grouped_category'].unique():
    group_df = m3_df[m3_df['grouped_category'] == group]
    kmf.fit(durations=group_df['duration'], event_observed=group_df['event'], label=group)
    kmf.plot_survival_function()

plt.title('Kaplan-Meier Survival Estimate by Grouped Categories')
plt.xlabel('Time (days)')
plt.ylabel('Survival Probability')
plt.legend(title='Grouped Category', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
[Figure: Kaplan-Meier Survival Estimate by Grouped Categories]

This Kaplan-Meier survival estimate chart displays the survival probabilities of companies grouped by industry categories over time. The plot reveals that Tech and Media and Entertainment sectors show higher initial declines in survival probability, while Life Sciences and Consumer Goods and Retail maintain higher survival probabilities longer. The shaded areas represent the confidence intervals for each group, indicating the uncertainty around the estimates.
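To check whether the separation between these grouped curves is statistically significant rather than visual noise, a multivariate log-rank test from lifelines can be applied to the same duration, event, and group columns; this is an optional follow-up check:

from lifelines.statistics import multivariate_logrank_test

# Null hypothesis: all grouped categories share the same survival function
result = multivariate_logrank_test(m3_df['duration'], m3_df['grouped_category'], m3_df['event'])
print(f"Log-rank test statistic: {result.test_statistic:.2f}, p-value: {result.p_value:.3g}")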

In [187]:
m3_df_sorted = m3_df.sort_values(['object_id', 'funded_at'])
m3_df_sorted['time_between_rounds'] = m3_df_sorted.groupby('object_id')['funded_at'].diff().dt.days

# Many rows show 0 days between rounds; since distinct rounds on the same day are unlikely, we remove them from the visualization
m3_df_sorted = m3_df_sorted[m3_df_sorted['time_between_rounds'] > 0]

# The data is heavily skewed, so apply a log-scale transformation
# log1p adds 1 before taking the log, because log(0) is undefined
log_time_between_rounds = np.log1p(m3_df_sorted['time_between_rounds'])


plt.figure(figsize=(10, 6))
sns.histplot(log_time_between_rounds, bins=50, kde=True)
plt.title('Log-Scaled Distribution of Time Between Funding Rounds')
plt.xlabel('Log of Time Between Rounds (days)')
plt.ylabel('Frequency')
plt.show()
[Figure: Log-Scaled Distribution of Time Between Funding Rounds]
The log-scaled histogram of time between funding rounds shows a distribution that is approximately normal, peaking around the exponent of 6, which corresponds to about 400 days. This suggests that the most common interval between funding rounds for startups is slightly over a year, indicating a typical annual cycle plus a little over a month. This pattern likely reflects annual financial planning cycles and the time required to achieve new milestones that attract further investment.
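For reference, converting that peak back from the log1p scale confirms the "slightly over a year" reading:

# Invert the log1p transform: a peak near 6 corresponds to about 402 days
print(np.expm1(6))  # ≈ 402.4 days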
In [188]:
# Count the number of funding rounds per company
funding_counts = m3_df_sorted.groupby('object_id').size().reset_index(name='num_rounds')

# Merge with exit status
exit_info = m3_df_sorted[['object_id', 'status']].drop_duplicates()
funding_exit_data = funding_counts.merge(exit_info, on='object_id', how='left')

# Visualize the distribution of funding rounds before exit
plt.figure(figsize=(10, 6))
sns.countplot(data=funding_exit_data, x='num_rounds', hue='status', palette='Set2')  # Using a different color palette for better visibility
plt.title('Number of Funding Rounds by Company Exit Status')
plt.xlabel('Number of Funding Rounds per Company')
plt.ylabel('Number of Companies')
plt.xlim(0, 10)
plt.legend(title='Exit Status', loc='upper right')
plt.show()
[Figure: Number of Funding Rounds by Company Exit Status]

Most companies secure 1 to 3 funding rounds before an exit, with a significant drop in those obtaining more than 3 rounds. This suggests that achieving key milestones within fewer rounds is critical for attracting acquisitions or public offerings.
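A one-line check on the funding_exit_data frame built above quantifies that observation:

# Share of companies with at most three recorded funding rounds
print(f"{(funding_exit_data['num_rounds'] <= 3).mean():.1%} of companies raised 3 or fewer rounds")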

In [189]:
# Filter to get only founders
founders_data = relationships_data[relationships_data['title'].str.contains('founder', case=False, na=False)]

# Count the number of founders per company
founder_counts = founders_data.groupby('relationship_object_id').size().reset_index(name='num_founders')

# Filter to include only companies with 8 or fewer founders
founder_counts = founder_counts[founder_counts['num_founders'] <= 8]

# Merge with company status
company_success = objects_data[['id', 'status']]
founder_success = founder_counts.merge(company_success, left_on='relationship_object_id', right_on='id', how='left')

# Visualize the impact of the number of founders on success using a point plot
plt.figure(figsize=(10, 6))
sns.pointplot(data=founder_success, x='status', y='num_founders', linestyle='none')
plt.title('Impact of Number of Founders on Company Success (Up to 8 Founders)')
plt.xlabel('Company Status')
plt.ylabel('Number of Founders')
plt.show()
[Figure: Impact of Number of Founders on Company Success (Up to 8 Founders)]

The data suggests that companies that achieve acquisition or IPO tend to have a slightly higher average number of founders (around 1.7) compared to currently operating companies (around 1.5 founders). However, the overlapping confidence intervals (shown by the vertical lines) indicate that this difference may not be statistically significant, meaning the number of founders might not be a strong predictor of startup success.
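One way to test that impression formally (a sketch using scipy, which is not otherwise used in this notebook) is a Mann-Whitney U test comparing founder counts for exited companies (acquired or IPO) against operating ones:

from scipy.stats import mannwhitneyu

exited = founder_success.loc[founder_success['status'].isin(['acquired', 'ipo']), 'num_founders']
operating = founder_success.loc[founder_success['status'] == 'operating', 'num_founders']

# Non-parametric test: do the two groups have different distributions of founder counts?
stat, p_value = mannwhitneyu(exited, operating, alternative='two-sided')
print(f"Mann-Whitney U = {stat:.0f}, p = {p_value:.4f}")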

In [190]:
# Let's create a more comprehensive mapping that categorizes degrees into main categories
extended_degree_mapping = {
    # Existing mappings
    "bs": "Bachelor",
    "ba": "Bachelor",
    "ms": "Master",
    "ma": "Master",
    "mba": "MBA",
    "phd": "PhD",
    
    # Additional mappings for similar degrees
    "master of": "Master",
    "master's of": "Master",
    "master in": "Master",
    "master's in": "Master",
    "bachelor of": "Bachelor",
    "bachelor's of": "Bachelor",
    "bachelor in": "Bachelor",
    "doctor of": "PhD",
    "doctorate": "PhD"
}

# Function to categorize degrees
def categorize_degree(degree_type):
    if pd.isna(degree_type):
        return "Unknown"
    degree_lower = str(degree_type).lower()
    for key, value in extended_degree_mapping.items():
        if key in degree_lower:
            return value
    return "Other"

degrees_data['degree_category'] = degrees_data['degree_type'].apply(categorize_degree)

# Let's consider operating, acquired and ipo status as successful
company_success = objects_data[objects_data['status'].isin(['operating', 'acquired', 'ipo'])][['id', 'status']]
degree_success = degrees_data.merge(company_success, left_on='object_id', right_on='id', how='left')

degree_counts = pd.crosstab(degree_success['degree_category'], degree_success['status'], normalize='columns') * 100
# Pass figsize to plot() directly so pandas doesn't leave a second, empty figure behind
degree_counts.plot(kind='bar', stacked=False, figsize=(12, 6))
plt.title('Distribution of Degree Types Across Successful Startups')
plt.xlabel('Degree Type')
plt.ylabel('Percentage (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[Figure: Distribution of Degree Types Across Successful Startups]
Among the individuals in our degrees data, Bachelor's degree holders dominate the educational landscape, representing over 50% of the population, followed by Master's degree holders and other degree types at around 15% each. Interestingly, PhD holders make up a relatively small percentage (about 4%), suggesting that advanced academic degrees might not be a prerequisite for entrepreneurial ventures.
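The exact percentages behind the chart can be printed from the degree_counts crosstab computed above if you want to verify these figures:

# Degree category share (in %) within each company status
print(degree_counts.round(1))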
In [191]:
# Filter for successful companies and get industry distribution
successful_companies = objects_data[objects_data['status'].isin(['operating', 'acquired', 'ipo'])]

# Calculate percentage distribution of industries
industry_dist = successful_companies['category_code'].value_counts(normalize=True) * 100

# Create bar plot
plt.figure(figsize=(12, 6))
sns.barplot(x=industry_dist.index, y=industry_dist.values)
plt.title('Distribution of Industries Among Successful Startups')
plt.xlabel('Industry Category')
plt.ylabel('Percentage (%)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()
[Figure: Distribution of Industries Among Successful Startups]

The plot reveals a clear dominance of technology-related industries among successful startups, with software, web, and other tech categories collectively accounting for over 35% of all successful ventures, suggesting a strong tech-centric trend in the startup ecosystem.

In [192]:
successful_founders = relationships_data[
    (relationships_data['relationship_object_id'].isin(successful_companies['id'])) & 
    (relationships_data['title'].str.contains('founder', case=False, na=False))
]

# Get the education data for these founders
founder_education = degrees_data.merge(
    successful_founders['person_object_id'], 
    left_on='object_id', 
    right_on='person_object_id', 
    how='inner'
)

# Count number of successful founders by institution
institution_success = founder_education['institution'].value_counts().head(20)  # Top 20 institutions

# Create a bar plot
plt.figure(figsize=(15, 8))
sns.barplot(x=institution_success.values, y=institution_success.index)
plt.title('Top 20 Institutions by Number of Successful Startup Founders')
plt.xlabel('Number of Successful Founders')
plt.ylabel('Institution')
plt.tight_layout()
plt.show()
[Figure: Top 20 Institutions by Number of Successful Startup Founders]

Let's do some geographic analysis now

In [193]:
import folium
from folium.plugins import HeatMap
import branca.colormap as cm
In [194]:
# Merge office data with filtered successful companies
geo_success = offices_data.merge(successful_companies[['id', 'status']], 
                              left_on='object_id', 
                              right_on='id', 
                              how='inner')  # using inner join to keep only successful companies

# 1. Success rates by country
country_success = geo_success['country_code'].value_counts().head(10)

plt.figure(figsize=(12, 6))
sns.barplot(x=country_success.index, y=country_success.values)
plt.title('Top 10 Countries by Number of Successful Startups')
plt.xlabel('Country')
plt.ylabel('Number of Successful Startups')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[Figure: Top 10 Countries by Number of Successful Startups]
In [195]:
successful_companies = objects_data[objects_data['status'].isin(['operating', 'acquired', 'ipo'])]
geo_success = offices_data.merge(successful_companies[['id', 'status']], 
                              left_on='object_id', 
                              right_on='id', 
                              how='inner')

# Drop rows with missing coordinates
geo_success = geo_success.dropna(subset=['latitude', 'longitude'])

# Create a list of coordinates and their weights (we'll use 1 for each startup)
locations = geo_success[['latitude', 'longitude']].values.tolist()
weights = [1] * len(locations)

# Create a base map centered on the mean coordinates
center_lat = geo_success['latitude'].mean()
center_lon = geo_success['longitude'].mean()
startup_map = folium.Map(location=[center_lat, center_lon], 
                        zoom_start=2,
                        tiles='CartoDB positron')
# Add the heatmap layer
HeatMap(list(zip(geo_success['latitude'], 
                 geo_success['longitude'], 
                 weights)),
        radius=15,
        blur=10,
        max_zoom=13).add_to(startup_map)

# Save the map
startup_map.save('startup_heatmap.html')

View it here

In [196]:
# Create a new map for showing all startups by status
geo_all = offices_data.merge(objects_data[['id', 'status']], 
                           left_on='object_id', 
                           right_on='id', 
                           how='inner')

# Drop rows with missing coordinates
geo_all = geo_all.dropna(subset=['latitude', 'longitude'])

# Define colors for each status
status_colors = {
    'operating': 'red',
    'acquired': 'blue',
    'closed': 'gray',
    'ipo': 'green',
    'live': 'orange',
    'beta': 'purple',
    'private': 'pink',
    'alpha': 'brown',
    'development': 'yellow'
}

# Create a new map
center_lat = geo_all['latitude'].mean()
center_lon = geo_all['longitude'].mean()
status_map = folium.Map(location=[center_lat, center_lon], 
                       zoom_start=2,
                       tiles='CartoDB positron')

# Create feature groups for each status
feature_groups = {status: folium.FeatureGroup(name=status) for status in status_colors.keys()}

# Add circles for each startup
for idx, row in geo_all.iterrows():
    if row['status'] in status_colors:
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=3,
            color=status_colors[row['status']],
            fill=True,
            popup=f"Status: {row['status']}",
            opacity=0.7,
            fill_opacity=0.7
        ).add_to(feature_groups[row['status']])

# Add all feature groups to map
for fg in feature_groups.values():
    fg.add_to(status_map)

# Add layer control to toggle different status layers
folium.LayerControl().add_to(status_map)

# Save the map
status_map.save('startup_status_map.html')

View it here

In [197]:
us_companies = geo_success[geo_success['country_code'] == 'USA']
state_success = us_companies['state_code'].value_counts().head(10)
city_success = geo_success['city'].value_counts().head(10)

plt.figure(figsize=(20, 6))

# First subplot for states
plt.subplot(1, 2, 1)
sns.barplot(x=state_success.index, y=state_success.values)
plt.title('Top 10 US States by Number of Successful Startups')
plt.xlabel('State')
plt.ylabel('Number of Successful Startups')
plt.xticks(rotation=45)

# Second subplot for cities
plt.subplot(1, 2, 2)
sns.barplot(x=city_success.index, y=city_success.values)
plt.title('Top 10 Cities by Number of Successful Startups')
plt.xlabel('City')
plt.ylabel('Number of Successful Startups')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
[Figure: Top 10 US States and Top 10 Cities by Number of Successful Startups]
In [198]:
# Merge investments with successful companies
successful_investments = investments_data.merge(
    successful_companies[['id', 'status']], 
    left_on='funded_object_id', 
    right_on='id', 
    how='inner'
)

# Count successful investments per investor
investor_success = successful_investments['investor_object_id'].value_counts().reset_index()
investor_success.columns = ['investor_id', 'successful_investments']

# Get total investments per investor
total_investments = investments_data['investor_object_id'].value_counts().reset_index()
total_investments.columns = ['investor_id', 'total_investments']

# Combine successful and total investments
investor_stats = investor_success.merge(total_investments, on='investor_id', how='right')
investor_stats['success_rate'] = investor_stats['successful_investments'] / investor_stats['total_investments']

# Merge with objects_data to get investor names
investor_stats = investor_stats.merge(
    objects_data[['id', 'name']], 
    left_on='investor_id', 
    right_on='id', 
    how='left'
)

# Sort by number of successful investments
top_investors = investor_stats.sort_values('successful_investments', ascending=False).head(20)

# Create visualizations
plt.figure(figsize=(20, 6))

# Plot successful investments
plt.subplot(1, 2, 1)
plt.bar(range(len(top_investors)), top_investors['successful_investments'])
plt.title('Top Investors by Successful Exits')
plt.xlabel('Investor')
plt.ylabel('Number of Successful Investments')
plt.xticks(range(len(top_investors)), top_investors['name'], rotation=45, ha='right')

# Plot success rate
plt.subplot(1, 2, 2)
plt.bar(range(len(top_investors)), top_investors['success_rate'])
plt.title('Investment Success Rate')
plt.xlabel('Investor')
plt.ylabel('Success Rate')
plt.xticks(range(len(top_investors)), top_investors['name'], rotation=45, ha='right')

plt.tight_layout()
plt.show()
[Figure: Top Investors by Successful Exits and Investment Success Rate]
In [199]:
# Time-based analysis
# Analyze funding patterns over time
# First aggregate the data by year
yearly_funding = funds_data.groupby(funds_data['funded_at'].dt.year)['raised_amount'].sum().reset_index()

# Create the plot
plt.figure(figsize=(12, 6))
plt.bar(yearly_funding['funded_at'], 
        yearly_funding['raised_amount'], 
        color='#2E86C1')

# Formatting
plt.title('Total Funding Amount by Year', fontsize=14)
plt.xlabel('Year', fontsize=12)
plt.ylabel('Total Raised Amount (Billions USD)', fontsize=12)

# Format x-axis
plt.xticks(rotation=45)

# Format y-axis to billions
plt.gca().yaxis.set_major_formatter(lambda x, p: f'${x/1e9:.1f}B')

# Add grid
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
[Figure: Total Funding Amount by Year]
In [200]:
# First merge investments with funding rounds to get actual dates
investments_with_dates = investments_data.merge(
    funding_rounds_data[['id', 'funded_at']], 
    left_on='funding_round_id', 
    right_on='id', 
    how='left'
)

# Get successful investments with dates
successful_investments_dated = investments_with_dates.merge(
    successful_companies[['id', 'status']], 
    left_on='funded_object_id', 
    right_on='id', 
    how='inner'
)

# Yearly trends
yearly_investments = investments_with_dates.groupby(investments_with_dates['funded_at'].dt.year).size()
yearly_successful = successful_investments_dated.groupby(successful_investments_dated['funded_at'].dt.year).size()
yearly_success_rate = yearly_successful / yearly_investments

# Investment Trends
plt.figure(figsize=(12, 6))
ax1 = sns.barplot(x=yearly_investments.index, y=yearly_investments.values)
plt.title('Total Investments by Year', fontsize=12, pad=15)
plt.xlabel('Year', fontsize=10)
plt.ylabel('Number of Investments', fontsize=10)
plt.xticks(rotation=45, ha='right')

# Add value labels on bars
for i, v in enumerate(yearly_investments.values):
    ax1.text(i, v, str(int(v)), ha='center', va='bottom')

plt.tight_layout()
plt.show();
[Figure: Total Investments by Year]

Investment activity grew exponentially from 1974 to 2013, with fewer than 50 annual investments until 1998. Growth accelerated after 2004, rising from 531 in 2005 to 15,253 in 2013—a 30-fold increase.

In [201]:
# Success Rate Trends 
plt.figure(figsize=(14, 6)) 
plt.subplots_adjust(left=0.1, right=0.9, bottom=0.15)

ax2 = sns.lineplot(x=yearly_success_rate.index, y=yearly_success_rate.values, marker='o')
plt.title('Investment Success Rate Over Time', fontsize=12, pad=15)
plt.xlabel('Year', fontsize=10)
plt.ylabel('Success Rate', fontsize=10)
plt.grid(True, alpha=0.3)
plt.xticks(rotation=45, ha='right')
plt.show();
[Figure: Investment Success Rate Over Time]

The investment success rate was consistently 100% through the 1970s and 1980s, likely due to limited data or selective reporting. A decline and volatility emerged around 1999-2000, coinciding with the dot-com bubble, hitting a low of 88% in 2000. Success rates fluctuated between 90-98% thereafter but showed a strong upward trend post-2010, nearing 100% by 2013.

Part4: Model: Analysis, Hypothesis Testing & ML

In the modeling phase of our data science lifecycle, we transition from preprocessed data to actionable insights. Our goal is to uncover patterns that can predict startup success and identify distinct startup clusters based on their characteristics.

Modeling Strategy

We are implementing three complementary approaches:

Random Forest Classification

Primary Objective: Predict startup success/failure outcomes.

  • We will leverage the model's feature importance capabilities to identify key success indicators.
  • Cross-validation will be employed to ensure robust performance metrics.
  • The target variable is a binary success indicator (1 for successful exits/IPOs, 0 for others).

XGBoost Classification

Primary Objective: Predict startup success/failure outcomes.

  • We will utilize advanced gradient-boosting techniques to enhance predictive accuracy.
  • Feature importance analysis will be incorporated to refine our understanding of critical success factors.

K-Means Clustering

Goal: Identify natural groupings within the startup ecosystem.

  • We will focus on discovering distinct patterns in funding rounds, team composition, and market sectors.
  • The elbow method will be used for optimal cluster determination.

Expected Outcome: Distinct startup profiles that can inform investment strategies.

Through this dual-model approach, we aim not only to predict outcomes but also to understand the underlying patterns that characterize different types of startups in our ecosystem. The insights derived from these models will provide stakeholders with data-driven decision-making tools for evaluating startup potential and optimizing resource allocation.

In [202]:
# We will use scikit-learn to implement our model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_curve,precision_recall_curve, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from collections import Counter
In [203]:
# Function to label data
# Create binary target (1 for success, 0 for failure)
threshold = 1e5
min_age = 365
def get_success_label(row):
    if row['status'] == 'ipo':
        return 1
    elif row['status'] == 'acquired':
         if pd.notnull(row['acquisition_price']) and row['acquisition_price'] > threshold:
            return 1
    elif row['status'] == 'operating' and row['time_difference'] > min_age:
        return 1
    return 0

# First, get acquisition prices for companies
acquisition_prices = acquisitions_data.groupby('acquired_object_id')['price_amount'].max().reset_index()
acquisition_prices.columns = ['id', 'acquisition_price']
objects_data = objects_data.merge(
    acquisition_prices, 
    on='id', 
    how='left'
)

# Apply it to our dataframe to create a new column
objects_data['success'] = objects_data.apply(get_success_label, axis=1)
In [204]:
# Initial features from objects_data
columns_to_copy = [
    "id", 
    "time_difference", 
    "category_code", 
    "country_code",
    "funding_rounds",
    "funding_total_usd",
    "investment_rounds",
    "invested_companies",
    "first_funding_at",
    "last_funding_at",
    "first_milestone_at",
    "last_milestone_at",
    "milestones",
    "relationships"
]

rfc_features = objects_data[columns_to_copy].copy()
rfc_features.columns = [
    "object_id", 
    "age",
    "category_code", 
    "country_code",
    "funding_rounds",
    "total_funding",
    "investment_rounds",
    "invested_companies",
    "first_funding_at",
    "last_funding_at",
    "first_milestone_at",
    "last_milestone_at",
    "milestone_count",
    "team_size"
]

# Calculate days to first milestone (since we need the calculation)
rfc_features['days_to_first_milestone'] = (
    pd.to_datetime(rfc_features['first_milestone_at']) - 
    pd.to_datetime(objects_data['founded_at'])
).dt.days

# Calculate average funding (derived feature)
rfc_features['avg_funding'] = rfc_features['total_funding'] / rfc_features['funding_rounds']
rfc_features['avg_funding'] = rfc_features['avg_funding'].fillna(0)
rfc_features['has_funding'] = rfc_features['funding_rounds'] > 0
rfc_features['is_investor'] = rfc_features['invested_companies'] > 0
rfc_features['funding_to_age_ratio'] = rfc_features['total_funding'] / rfc_features['age']
rfc_features['funding_to_age_ratio'] = rfc_features['funding_to_age_ratio'].fillna(0)

# Acquisition features
# Times as acquirer
acquirer_count = acquisitions_data.groupby('acquiring_object_id').size().reset_index()
acquirer_count.columns = ['object_id', 'times_as_acquirer']
rfc_features = rfc_features.merge(acquirer_count, on='object_id', how='left')
# Total acquisition spend
acquisition_spend = acquisitions_data.groupby('acquiring_object_id').agg({
    'price_amount': 'sum'
}).reset_index()
acquisition_spend.columns = ['object_id', 'total_acquisition_spend']
rfc_features = rfc_features.merge(acquisition_spend, on='object_id', how='left')
# Fill NaN values with 0 for new features
acquisition_columns = ['times_as_acquirer', 'total_acquisition_spend',]
rfc_features[acquisition_columns] = rfc_features[acquisition_columns].fillna(0)

# Office Features
office_count = offices_data.groupby('object_id').size().reset_index()
office_count.columns = ['object_id', 'office_count']
rfc_features = rfc_features.merge(office_count, on='object_id', how='left')
rfc_features['office_count'] = rfc_features['office_count'].fillna(0)
# Add number of countries with offices (international presence)
office_countries = offices_data.groupby('object_id')['country_code'].nunique().reset_index()
office_countries.columns = ['object_id', 'countries_with_offices']
rfc_features = rfc_features.merge(office_countries, on='object_id', how='left')
rfc_features['countries_with_offices'] = rfc_features['countries_with_offices'].fillna(0)

# We saw earlier that there are a lot of successful startup founders from certain institutions, let's try to test that:
successful_founders = relationships_data[
    (relationships_data['relationship_object_id'].isin(
        objects_data[objects_data['success'] == 1]['id']
    )) & 
    (relationships_data['title'].str.contains('founder', case=False, na=False))
]['person_object_id']
# Get top 50 institutions from successful founders' education
top_50_institutes = (
    degrees_data[degrees_data['object_id'].isin(successful_founders)]
    ['institution']
    .value_counts()
    .nlargest(50)
    .index
    .tolist()
)
rfc_features['top_institute'] = (
    rfc_features['object_id']
    .isin(
        degrees_data[
            (degrees_data['object_id'].isin(successful_founders)) & 
            (degrees_data['institution'].isin(top_50_institutes))
        ]['object_id']
    )
    .astype(int)
)
In [205]:
# Let's check for and handle missing data
rfc_features.isnull().sum()
Out[205]:
object_id                       0
age                        366941
category_code              339465
country_code               367608
funding_rounds                  0
total_funding                   0
investment_rounds               0
invested_companies              0
first_funding_at           431144
last_funding_at            431144
first_milestone_at         362293
last_milestone_at          362293
milestone_count                 0
team_size                       0
days_to_first_milestone    417492
avg_funding                     0
has_funding                     0
is_investor                     0
funding_to_age_ratio            0
times_as_acquirer               0
total_acquisition_spend         0
office_count                    0
countries_with_offices          0
top_institute                   0
dtype: int64
In [206]:
# Our dataset has 462,651 rows, and funding dates are missing for around 90% of them and milestone dates for around 80%, so:
# Add binary indicators
rfc_features['has_funding'] = rfc_features['funding_rounds'] > 0
rfc_features['has_milestones'] = rfc_features['milestone_count'] > 0

# Drop date-related columns
columns_to_drop = [
    'first_funding_at', 
    'last_funding_at', 
    'first_milestone_at', 
    'last_milestone_at',
    'days_to_first_milestone'
]
rfc_features = rfc_features.drop(columns=columns_to_drop)
In [207]:
# Categorical features
categorical_features = ['category_code', 'country_code']
for col in categorical_features:
    # Fill with 'unknown' or create a new category for missing values
    rfc_features[col] = rfc_features[col].fillna('unknown')
In [208]:
# Let's save the dataframe as we might use some of the features later which we don't need now
saved_rfc = rfc_features.copy()
In [209]:
# Let's see if we were able to fully deal with null data
rfc_features.isnull().sum()
# The age column is still mostly null (about 80% missing), so we drop it along with the ratio derived from it
rfc_features = rfc_features.drop(['age', 'funding_to_age_ratio'], axis=1)
In [210]:
# Let's check our features for collinearity
# Compute the correlation matrix
# First identify numerical features from original dataframe
numerical_features = rfc_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Check collinearity on original numerical data
corr_matrix = rfc_features[numerical_features].corr()

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
[Figure: Correlation Matrix]
In [211]:
# Let's remove the features that are highly correlated > 0.8
columns_to_drop = ['avg_funding', 'office_count', 'invested_companies']
rfc_features = rfc_features.drop(columns=columns_to_drop)
In [212]:
# Now, let's see the VIF (Variance Inflation Factor)
# It measures how much a feature is correlated with all other features combined
vif_data = pd.DataFrame()
numerical_features = rfc_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
vif_data["feature"] = numerical_features
vif_data["VIF"] = [variance_inflation_factor(rfc_features[numerical_features].values, i)
                   for i in range(len(numerical_features))]

vif_data
Out[212]:
feature VIF
0 funding_rounds 1.407169
1 total_funding 1.121879
2 investment_rounds 1.054911
3 milestone_count 1.414665
4 team_size 2.080406
5 times_as_acquirer 2.023916
6 total_acquisition_spend 1.488949
7 countries_with_offices 1.396363
8 top_institute 1.013808
In [213]:
# Now let's check for correlation again
corr_matrix = rfc_features[numerical_features].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()
[Figure: Correlation Matrix (after dropping highly correlated features)]
In [214]:
# Prepare the data
# Encode categorical variables
le = LabelEncoder()
categorical_features = ['category_code', 'country_code']
rfc_features_encoded = rfc_features.copy()

for col in categorical_features:
    rfc_features_encoded[col] = le.fit_transform(rfc_features_encoded[col])
In [215]:
# Scale numerical features
numerical_features = ['funding_rounds', 'total_funding', 'investment_rounds',
                     'milestone_count', 'team_size', 'times_as_acquirer',
                     'total_acquisition_spend', 'countries_with_offices', "top_institute"]

scaler = StandardScaler()
rfc_features_encoded[numerical_features] = scaler.fit_transform(rfc_features_encoded[numerical_features])
In [216]:
# Prepare features and target
X = rfc_features_encoded.drop('object_id', axis=1)
y = objects_data['success']
In [217]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
In [218]:
# Train Random Forest model
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'
)
In [219]:
# Perform cross-validation
cv_scores = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='roc_auc')
print("Cross-validation scores:", cv_scores)
print("Average CV score:", cv_scores.mean())
Cross-validation scores: [0.96200467 0.96195164 0.96255684 0.96146801 0.96172271]
Average CV score: 0.9619407724705891
In [220]:
# Train the final model
rf_model.fit(X_train, y_train)
Out[220]:
RandomForestClassifier(class_weight='balanced', max_depth=15,
                       min_samples_leaf=2, min_samples_split=5, n_jobs=-1,
                       random_state=42)
In [221]:
#  Make predictions
y_pred = rf_model.predict(X_test)
y_pred_proba = rf_model.predict_proba(X_test)[:, 1]
In [222]:
# Print comprehensive evaluation metrics
print("\nModel Performance Metrics:")
print("-------------------------")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_pred_proba))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Model Performance Metrics:
-------------------------
Accuracy: 0.8964995515016589
Precision: 0.6638296317942723
Recall: 0.9798350137488543
F1 Score: 0.7914552620691157
ROC AUC Score: 0.9610955888095624

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.88      0.93     73984
           1       0.66      0.98      0.79     18547

    accuracy                           0.90     92531
   macro avg       0.83      0.93      0.86     92531
weighted avg       0.93      0.90      0.90     92531


Confusion Matrix:
[[64781  9203]
 [  374 18173]]

The current model's results highlight a few key observations and challenges:

  • Class Imbalance: The model is cautious with negative predictions because the dataset has fewer successful startups.
  • High Recall Priority: The model prioritizes recall, ensuring it doesn’t miss successful startups but trades off precision.
  • Complex Pattern Recognition: A high ROC AUC score suggests the model is effectively learning underlying patterns.

Some improvement strategies that we could use are:

  1. Improve Data Quality
  2. Feature Engineering
  3. Model Tuning
  4. Address Class Imbalance

While the model is performing well overall, there’s room for improvement, especially in reducing false positives while maintaining the high recall rate.
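One concrete way to act on point 4 above is to rebalance the training data before fitting instead of relying only on class_weight='balanced'. The following is a minimal sketch, not part of the original pipeline, using SMOTE from imbalanced-learn (one of the libraries listed in the references) on the already-encoded X_train and y_train from the split above.

# Illustrative sketch: oversample the minority (success) class with SMOTE,
# then refit the same Random Forest configuration without class weighting.
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

rf_balanced = RandomForestClassifier(
    n_estimators=100, max_depth=15, min_samples_split=5,
    min_samples_leaf=2, random_state=42, n_jobs=-1
)
rf_balanced.fit(X_train_res, y_train_res)
print(classification_report(y_test, rf_balanced.predict(X_test)))

Whether this actually improves precision while preserving the high recall would need to be confirmed against the baseline metrics above.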

In [223]:
# Feature importance analysis
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
feature_importance
Out[223]:
feature importance
0 category_code 0.501047
11 countries_with_offices 0.203731
1 country_code 0.179637
6 team_size 0.036557
13 has_milestones 0.024588
5 milestone_count 0.017651
7 has_funding 0.012379
3 total_funding 0.008004
2 funding_rounds 0.007692
4 investment_rounds 0.004858
8 is_investor 0.002054
9 times_as_acquirer 0.000974
10 total_acquisition_spend 0.000478
12 top_institute 0.000349
In [224]:
plt.bar(feature_importance['feature'][:10], feature_importance['importance'][:10])
plt.xticks(rotation=45, ha='right')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()
[Figure: Feature Importance (top 10, Random Forest)]

The analysis reveals that industry sector (category_code) is by far the strongest predictor of startup success (importance of roughly 0.50), followed by international office presence (countries_with_offices, about 0.20) and location (country_code, about 0.18). This highlights the role of structural and contextual factors over operational metrics. Surprisingly, funding-related features (e.g., total_funding, funding_rounds), milestones, and the presence of a top_institute show minimal impact, suggesting that sector and geography matter more than funding and institutional affiliations.
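Impurity-based importances from tree ensembles can be inflated for high-cardinality features such as category_code and country_code, so the ranking above is worth cross-checking. The sketch below is an addition rather than part of the original analysis; it reuses rf_model, X_test, and y_test and computes permutation importance, i.e. the drop in test ROC AUC when each feature is shuffled.

# Sketch: cross-check the impurity-based ranking with permutation importance
# on the held-out test set.
from sklearn.inspection import permutation_importance

perm = permutation_importance(
    rf_model, X_test, y_test,
    scoring='roc_auc', n_repeats=5, random_state=42, n_jobs=-1
)
perm_importance = pd.Series(perm.importances_mean, index=X_test.columns)
print(perm_importance.sort_values(ascending=False).head(10))

If the two rankings broadly agree, the conclusion about sector and geography stands; large disagreements would suggest the impurity-based scores are being driven by feature cardinality rather than genuine signal.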

Let's implement the XGBoost classification model to evaluate whether it can improve our metrics compared to the previously used Random Forest model.

In [225]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
In [226]:
# Encode categorical variables
le = LabelEncoder()
categorical_features = ['category_code', 'country_code']
xgb_features_encoded = rfc_features.copy()

for col in categorical_features:
    xgb_features_encoded[col] = le.fit_transform(xgb_features_encoded[col])
In [227]:
numerical_features = [
    'funding_rounds', 'total_funding', 'investment_rounds',
    'milestone_count', 'team_size', 'times_as_acquirer',
    'total_acquisition_spend', 'countries_with_offices', "top_institute"
]

scaler = StandardScaler()
xgb_features_encoded[numerical_features] = scaler.fit_transform(xgb_features_encoded[numerical_features])

# Prepare features and target
X = xgb_features_encoded.drop('object_id', axis=1)
y = objects_data['success']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
In [228]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
In [229]:
# Define parameters
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 15,
    'learning_rate': 0.1,
    'min_child_weight': 5,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'scale_pos_weight': 1,  
    'seed': 42,
}

# Perform cross-validation
cv_results = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    nfold=5,
    stratified=True,
    early_stopping_rounds=10,
    metrics='auc',
    seed=42,
)
In [231]:
print("Cross-validation scores:", cv_results['test-auc-mean'])
print("Best iteration:", cv_results.shape[0])
print("Best CV AUC Score:", cv_results['test-auc-mean'].max())
Cross-validation scores: 0     0.959237
1     0.960876
2     0.960027
3     0.957575
4     0.959172
5     0.957426
6     0.959197
7     0.960099
8     0.959505
9     0.960238
10    0.960764
11    0.961210
12    0.961624
13    0.961866
14    0.962032
15    0.961850
16    0.962002
17    0.962113
18    0.962183
19    0.962057
20    0.962133
21    0.962221
22    0.962156
23    0.962221
24    0.962282
25    0.962223
26    0.962273
27    0.962323
28    0.962345
29    0.962363
30    0.962404
31    0.962447
32    0.962451
33    0.962415
34    0.962423
35    0.962394
36    0.962403
37    0.962420
38    0.962429
39    0.962446
40    0.962455
Name: test-auc-mean, dtype: float64
Best iteration: 41
Best CV AUC Score: 0.9624551748288382
In [232]:
# Train the final model
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=cv_results.shape[0],
    evals=[(dtrain, 'train')],
)
[0]	train-auc:0.96063
[1]	train-auc:0.96225
[2]	train-auc:0.96151
[3]	train-auc:0.95895
[4]	train-auc:0.96081
[5]	train-auc:0.95920
[6]	train-auc:0.96103
[7]	train-auc:0.96214
[8]	train-auc:0.96164
[9]	train-auc:0.96271
[10]	train-auc:0.96346
[11]	train-auc:0.96399
[12]	train-auc:0.96463
[13]	train-auc:0.96512
[14]	train-auc:0.96544
[15]	train-auc:0.96534
[16]	train-auc:0.96555
[17]	train-auc:0.96581
[18]	train-auc:0.96606
[19]	train-auc:0.96599
[20]	train-auc:0.96616
[21]	train-auc:0.96634
[22]	train-auc:0.96627
[23]	train-auc:0.96646
[24]	train-auc:0.96659
[25]	train-auc:0.96649
[26]	train-auc:0.96668
[27]	train-auc:0.96683
[28]	train-auc:0.96697
[29]	train-auc:0.96710
[30]	train-auc:0.96718
[31]	train-auc:0.96727
[32]	train-auc:0.96735
[33]	train-auc:0.96736
[34]	train-auc:0.96744
[35]	train-auc:0.96745
[36]	train-auc:0.96752
[37]	train-auc:0.96761
[38]	train-auc:0.96768
[39]	train-auc:0.96775
[40]	train-auc:0.96779
In [233]:
# Make predictions
y_pred_proba = xgb_model.predict(dtest)
y_pred = (y_pred_proba >= 0.5).astype(int)

# Evaluation Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
In [234]:
print("\nModel Performance Metrics:")
print("-------------------------")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
Model Performance Metrics:
-------------------------
Accuracy: 0.9095
Precision: 0.7451
Recall: 0.8340
F1 Score: 0.7870
ROC AUC Score: 0.9615

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94     73984
           1       0.75      0.83      0.79     18547

    accuracy                           0.91     92531
   macro avg       0.85      0.88      0.86     92531
weighted avg       0.91      0.91      0.91     92531


Confusion Matrix:
[[68691  5293]
 [ 3079 15468]]
In [235]:
# Feature Importance (Optional)
xgb.plot_importance(xgb_model, max_num_features=10)
plt.title('Top 10 Feature Importances')
plt.show()
[Figure: Top 10 Feature Importances (XGBoost)]

The analysis reveals that industry sector (category_code) and total funding (total_funding) are the most significant predictors of startup success, with F scores of 5152.0 and 5102.0, respectively. This underscores the importance of both the industry context and financial backing. Notably, location (country_code) and team size (team_size) also play crucial roles. Surprisingly, features such as milestone_count, funding_rounds, investment_rounds, top_institute and countries_with_offices show less impact, suggesting that while operational metrics are relevant, they are secondary to sector and funding considerations.
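One caveat when reading this chart: xgb.plot_importance defaults to importance_type='weight', so the F scores above simply count how often a feature is used to split, which tends to favor features with many distinct values. As an additional check (not part of the original analysis), the same plot can be drawn with gain-based importance, which weights each split by its loss reduction:

# Sketch: re-plot importances using 'gain' instead of the default 'weight'
# (split counts) shown above.
xgb.plot_importance(xgb_model, max_num_features=10, importance_type='gain')
plt.title('Top 10 Feature Importances (gain)')
plt.tight_layout()
plt.show()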

In [236]:
# Ensure predictions are stored separately
# Assign predictions for Random Forest
y_pred_rf = rf_model.predict(X_test)
y_pred_proba_rf = rf_model.predict_proba(X_test)[:, 1]

# Assign predictions for XGBoost
y_pred_xgb = (y_pred_proba >= 0.5).astype(int)
y_pred_proba_xgb = y_pred_proba  

# Compute evaluation metrics for Random Forest
metrics_rf = {
    'Accuracy': accuracy_score(y_test, y_pred_rf),
    'Precision': precision_score(y_test, y_pred_rf),
    'Recall': recall_score(y_test, y_pred_rf),
    'F1 Score': f1_score(y_test, y_pred_rf),
    'ROC AUC Score': roc_auc_score(y_test, y_pred_proba_rf)
}

# Compute evaluation metrics for XGBoost
metrics_xgb = {
    'Accuracy': accuracy_score(y_test, y_pred_xgb),
    'Precision': precision_score(y_test, y_pred_xgb),
    'Recall': recall_score(y_test, y_pred_xgb),
    'F1 Score': f1_score(y_test, y_pred_xgb),
    'ROC AUC Score': roc_auc_score(y_test, y_pred_proba_xgb)
}

# Create a DataFrame to compare metrics
metrics_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC Score'],
    'Random Forest': [metrics_rf['Accuracy'], metrics_rf['Precision'],
                      metrics_rf['Recall'], metrics_rf['F1 Score'],
                      metrics_rf['ROC AUC Score']],
    'XGBoost': [metrics_xgb['Accuracy'], metrics_xgb['Precision'],
               metrics_xgb['Recall'], metrics_xgb['F1 Score'],
               metrics_xgb['ROC AUC Score']]
})

# Plotting ROC Curves
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_proba_xgb)

# Plotting Precision-Recall Curves
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_pred_proba_rf)
precision_xgb, recall_xgb, _ = precision_recall_curve(y_test, y_pred_proba_xgb)

# Plotting Confusion Matrices
cm_rf = confusion_matrix(y_test, y_pred_rf)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
# Set up the matplotlib figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# 1st Row - ROC Curves
axes[0, 0].plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {metrics_rf["ROC AUC Score"]:.2f})')
axes[0, 0].plot(fpr_xgb, tpr_xgb, label=f'XGBoost (AUC = {metrics_xgb["ROC AUC Score"]:.2f})')
axes[0, 0].plot([0, 1], [0, 1], 'k--')
axes[0, 0].set_xlabel('False Positive Rate')
axes[0, 0].set_ylabel('True Positive Rate')
axes[0, 0].set_title('ROC Curves')
axes[0, 0].legend(loc='lower right')

# 1st Row - Precision-Recall Curves
axes[0, 1].plot(recall_rf, precision_rf, label='Random Forest')
axes[0, 1].plot(recall_xgb, precision_xgb, label='XGBoost')
axes[0, 1].set_xlabel('Recall')
axes[0, 1].set_ylabel('Precision')
axes[0, 1].set_title('Precision-Recall Curves')
axes[0, 1].legend(loc='lower left')

# 2nd Row - Confusion Matrices
# Random Forest
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Blues', ax=axes[1, 0])
axes[1, 0].set_xlabel('Predicted')
axes[1, 0].set_ylabel('Actual')
axes[1, 0].set_title('Confusion Matrix - Random Forest')

# XGBoost
sns.heatmap(cm_xgb, annot=True, fmt='d', cmap='Greens', ax=axes[1, 1])
axes[1, 1].set_xlabel('Predicted')
axes[1, 1].set_ylabel('Actual')
axes[1, 1].set_title('Confusion Matrix - XGBoost')

# Add metrics table below the plots
metrics_table = plt.figure(figsize=(12, 2))
table_ax = metrics_table.add_subplot(111)
table_ax.axis('off')
table = table_ax.table(cellText=metrics_df.iloc[:, 1:].values,
                      colLabels=metrics_df.columns[1:],
                      rowLabels=metrics_df['Metric'],
                      loc='center',
                      cellLoc='center')

table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.5, 1.5)
table_ax.set_title('Model Performance Metrics Comparison', fontsize=14)

# Adjust layout
plt.tight_layout()

# Show plots
plt.show()
[Figures: ROC curves, precision-recall curves, and confusion matrices for Random Forest and XGBoost; model performance metrics comparison table]

To identify distinct segments of startups based on their characteristics and performance metrics, we can implement K-Means clustering.

Now, let's try to identify distinct groups or segments of startups to understand the common traits and patterns that contribute to their success or failure.

In [237]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
In [238]:
selected_features = [
    'funding_rounds',
    'total_funding',
    'investment_rounds',
    'invested_companies',
    'milestone_count',
    'team_size',
    'office_count',
    'countries_with_offices',
    'times_as_acquirer',
    'total_acquisition_spend',
    'category_code',
    'country_code',
    'has_funding',
    'has_milestones',
    'top_institute'
]

# I decided not to include 'age_category' and 'age' in the model due to a high proportion (80%) of missing values.
# Including these features with such a significant amount of missing data would introduce bias or reduce model performance
# due to imputation or removal of rows. Handling missing data at this scale could lead to inaccurate predictions or overfitting.

# Create kmc_df with the selected features
kmc_df = saved_rfc[selected_features].copy()

Data Preprocessing

Let's prepare the data for effective clustering by addressing quality issues and transforming features into suitable formats.

Missing data does not need to be handled again here, as it was already addressed for the Random Forest model.

We have to encode categorical variables into numerical formats so the clustering algorithm can process them.

To do so, we use One-Hot Encoding, while the binary indicator features are kept as 0/1.

Also, we perform feature scaling to ensure that all variables contribute equally to the distance calculations in K-Means.

In [239]:
# Convert 'has_funding' and 'has_milestones' from 'true/false' to 1/0
binary_mapping = {
    True: 1,
    False: 0
}
kmc_df['has_funding'] = kmc_df['has_funding'].map(binary_mapping)
kmc_df['has_milestones'] = kmc_df['has_milestones'].map(binary_mapping)

# Ensure 'top_institute' is numeric (already 1 or 0)
kmc_df['top_institute'] = pd.to_numeric(kmc_df['top_institute'], errors='coerce')
kmc_df = kmc_df.dropna().reset_index(drop=True)

# ================================
# Step 4: Defining Categorical and Numerical Features
# ================================
categorical_features = ['category_code', 'country_code']
binary_features = ['has_funding', 'has_milestones', 'top_institute']
numerical_features = [
    'funding_rounds',
    'total_funding',
    'investment_rounds',
    'invested_companies',
    'milestone_count',
    'team_size',
    'office_count',
    'countries_with_offices',
    'times_as_acquirer',
    'total_acquisition_spend'
]

kmc_df_cleaned = kmc_df.dropna().reset_index(drop=True)

#  Preprocessing Pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), categorical_features),
        ('scale', StandardScaler(), numerical_features)
    ],
    remainder='passthrough'  # Keep binary features as they are
)

# Apply Preprocessing on the cleaned data
kmc_processed = preprocessor.fit_transform(kmc_df_cleaned)

# Get Feature Names After Encoding
onehot_features = preprocessor.named_transformers_['onehot'].get_feature_names_out(categorical_features)
processed_feature_names = list(onehot_features) + numerical_features + binary_features

# Convert to DataFrame
kmc_df_preprocessed = pd.DataFrame(kmc_processed, columns=processed_feature_names)

# Convert all columns to numeric, coercing errors to NaN
kmc_df_preprocessed = kmc_df_preprocessed.apply(pd.to_numeric, errors='coerce')

# Replace infinite values with NaN and drop them
kmc_df_preprocessed.replace([np.inf, -np.inf], np.nan, inplace=True)
kmc_df_preprocessed.dropna(inplace=True)
In [240]:
pca = PCA(n_components=0.95, random_state=42)  # Retain 95% of the variance
kmc_pca = pca.fit_transform(kmc_df_preprocessed)

print(f"\nNumber of components after PCA: {pca.n_components_}")
Number of components after PCA: 10

Determine the Optimal Number of Clusters

Now, we will identify the most appropriate number of clusters (k) that best represents the underlying structure of the data.

We will use two methods to determine the optimal number of clusters:

  • Elbow Method:
  • Silhouette Score
In [241]:
# Define the range of clusters to try
cluster_range = range(2, 11)  # K=2 to K=10
inertia = []
silhouette_scores = []

# Sample a subset if the dataset is large to speed up computations
sample_size = 10000
if kmc_pca.shape[0] > sample_size:
    np.random.seed(42)
    sample_indices = np.random.choice(kmc_pca.shape[0], sample_size, replace=False)
    kmc_sample = kmc_pca[sample_indices]
else:
    kmc_sample = kmc_pca

for k in cluster_range:
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=42, batch_size=1000, n_init=10)
    kmeans.fit(kmc_sample)
    inertia.append(kmeans.inertia_)
    try:
        score = silhouette_score(kmc_sample, kmeans.labels_)
    except Exception as e:
        print(f"Error calculating Silhouette Score for K={k}: {e}")
        score = np.nan
    silhouette_scores.append(score)
    print(f"K={k}: Inertia={kmeans.inertia_}, Silhouette Score={score}")
K=2: Inertia=66931.150313527, Silhouette Score=0.6560077783189115
K=3: Inertia=60238.63243525057, Silhouette Score=0.6579750711789613
K=4: Inertia=55569.531256901835, Silhouette Score=0.6619262304630913
K=5: Inertia=55269.281078452565, Silhouette Score=0.19700786907980922
K=6: Inertia=51760.77704151255, Silhouette Score=0.6852864514692218
K=7: Inertia=54474.01506218639, Silhouette Score=0.5186994845975368
K=8: Inertia=49595.05041835685, Silhouette Score=0.6535624784914298
K=9: Inertia=50605.55099369579, Silhouette Score=0.5647622396220242
K=10: Inertia=41552.38159224928, Silhouette Score=0.6921457047304662
In [242]:
plt.figure(figsize=(14, 6))

# Elbow Method Plot
plt.subplot(1, 2, 1)
plt.plot(cluster_range, inertia, marker='o')
plt.title('Elbow Method for Determining Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia (WCSS)')
plt.xticks(cluster_range)
plt.grid(True)

# Silhouette Scores Plot
plt.subplot(1, 2, 2)
plt.plot(cluster_range, silhouette_scores, marker='o', color='orange')
plt.title('Silhouette Scores for Different K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.xticks(cluster_range)
plt.grid(True)

plt.tight_layout()
plt.show()
[Figure: Elbow Method and Silhouette Scores for different K]

To choose the optimal K for K-Means, I use the Elbow Method and Silhouette Score. The Elbow Method identifies the "elbow point" where the Within-Cluster Sum of Squares (WCSS) sharply decreases, indicating diminishing returns—typically around K=4 or K=5. The Silhouette Score highlights the K with the best cluster cohesion and separation—often K=3 or K=6. I balance both methods, starting with K=4.
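As a quick programmatic summary of the silhouette criterion (an addition to the analysis above), the best-scoring candidate can be read off directly and then weighed against the elbow reading:

# Sketch: report the K with the highest silhouette score on the sample;
# the final choice still balances this against the elbow plot.
best_k = cluster_range[int(np.nanargmax(silhouette_scores))]
print(f"Highest silhouette score: {np.nanmax(silhouette_scores):.3f} at K={best_k}")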

Apply K-Means Clustering

We can now execute the clustering algorithm to segment the startups into distinct groups based on their features.

In [247]:
# Apply K-Means
optimal_k = 4

# Fit K-Means with the optimal number of clusters on the full preprocessed data
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
kmc_df_preprocessed['cluster'] = kmeans_final.fit_predict(kmc_df_preprocessed)

# Also keep the PCA-transformed data for visualization
kmc_df_preprocessed_pca = pd.DataFrame(kmc_pca, columns=[f'PC{i+1}' for i in range(kmc_pca.shape[1])])
kmc_df_preprocessed_pca['cluster'] = kmc_df_preprocessed['cluster']
kmc_df_cleaned['cluster'] = kmc_df_preprocessed['cluster'].values

Evaluate and Interpret Clusters

Let's assess the quality of the clusters and extract meaningful insights from the segmented groups.

In [248]:
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x='PC1',
    y='PC2',
    hue='cluster',
    palette='Set1',
    data=kmc_df_preprocessed_pca,
    legend='full',
    alpha=0.6
)
plt.title('Clusters Visualization in PCA Space')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.grid(True)
plt.show()
[Figure: Clusters Visualization in PCA Space]

This plot uses Principal Component Analysis (PCA) to reduce the data's dimensions and visualize the clusters. The axes represent the first two principal components, capturing the most variance, while each color shows a distinct cluster. I use this to assess how well-separated the clusters are.
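The qualifier "capturing the most variance" can be made concrete by inspecting the fitted PCA object; the small check below is an addition that reuses the pca object defined earlier to show how faithful this 2-D view is.

# Sketch: how much of the total variance do PC1 and PC2 actually explain?
explained = pca.explained_variance_ratio_
print(f"PC1: {explained[0]:.1%}, PC2: {explained[1]:.1%}, "
      f"combined: {explained[:2].sum():.1%}")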

In [249]:
# Calculate mean values for numerical features per cluster
cluster_profile = kmc_df_cleaned.groupby('cluster')[numerical_features].mean().reset_index()
print("Cluster Profile Summary (Mean Values):")
print(cluster_profile)

# Calculate distribution of categorical features per cluster
for feature in categorical_features:
    cross_tab = pd.crosstab(kmc_df_cleaned[feature], kmc_df_cleaned['cluster'], normalize='index') * 100
    print(f"\nPercentage Distribution of {feature} Across Clusters:")
    print(cross_tab)
Cluster Profile Summary (Mean Values):
   cluster  funding_rounds  total_funding  investment_rounds  \
0        0        0.530692   4.218907e+06           0.486156   
1        1        0.005681   1.706352e+04           0.048032   
2        2        0.769231   3.866269e+08          16.769231   
3        3        0.068493   6.061644e+05         223.643836   

   invested_companies  milestone_count   team_size  office_count  \
0            0.381213         0.655634    2.952702      1.181274   
1            0.044969         0.163526    1.465044      0.000000   
2           13.846154         4.307692  577.384615      6.153846   
3          149.506849         1.986301   45.506849      2.410959   

   countries_with_offices  times_as_acquirer  total_acquisition_spend  
0                1.095815           0.082917             7.892824e+06  
1                0.000000           0.001761             1.410640e+05  
2                3.153846          76.000000             2.382612e+10  
3                1.739726           0.000000             0.000000e+00  

Percentage Distribution of category_code Across Clusters:
cluster                   0          1         2         3
category_code                                             
advertising       71.859626  28.140374  0.000000  0.000000
analytics         86.790607  13.209393  0.000000  0.000000
automotive        55.670103  44.329897  0.000000  0.000000
biotech           94.221219   5.778781  0.000000  0.000000
cleantech         81.443299  18.556701  0.000000  0.000000
consulting        69.996005  30.003995  0.000000  0.000000
design            55.871886  44.128114  0.000000  0.000000
ecommerce         64.368450  35.631550  0.000000  0.000000
education         52.395726  47.604274  0.000000  0.000000
enterprise        75.163252  24.814231  0.022517  0.000000
fashion           67.850799  32.149201  0.000000  0.000000
finance           69.480519  30.375180  0.000000  0.144300
games_video       47.420213  52.579787  0.000000  0.000000
government        83.720930  16.279070  0.000000  0.000000
hardware          74.889868  25.076245  0.033887  0.000000
health            54.416961  45.583039  0.000000  0.000000
hospitality       56.250000  43.750000  0.000000  0.000000
legal             59.189723  40.810277  0.000000  0.000000
local             42.420382  57.579618  0.000000  0.000000
manufacturing     78.676471  21.323529  0.000000  0.000000
medical           70.858630  29.141370  0.000000  0.000000
messaging         85.810811  14.189189  0.000000  0.000000
mobile            68.099679  31.856602  0.043719  0.000000
music             44.922547  55.077453  0.000000  0.000000
nanotech          97.142857   2.857143  0.000000  0.000000
network_hosting   69.872340  30.085106  0.042553  0.000000
news              50.260417  49.739583  0.000000  0.000000
nonprofit         86.413043  13.586957  0.000000  0.000000
other             52.728207  47.264449  0.000000  0.007344
pets              77.049180  22.950820  0.000000  0.000000
photo_video       47.977941  52.022059  0.000000  0.000000
public_relations  71.960647  28.039353  0.000000  0.000000
real_estate       66.455696  33.544304  0.000000  0.000000
search            54.078827  45.875344  0.045830  0.000000
security          75.320239  24.679761  0.000000  0.000000
semiconductor     94.396552   5.603448  0.000000  0.000000
social            61.297710  38.702290  0.000000  0.000000
software          77.457873  22.530968  0.011159  0.000000
sports            33.777778  66.222222  0.000000  0.000000
transportation    39.672802  60.327198  0.000000  0.000000
travel            47.435897  52.564103  0.000000  0.000000
unknown            4.071112  95.908267  0.000000  0.020621
web               62.733166  37.240376  0.026459  0.000000

Percentage Distribution of country_code Across Clusters:
cluster                0          1         2         3
country_code                                           
AFG           100.000000   0.000000  0.000000  0.000000
AGO           100.000000   0.000000  0.000000  0.000000
AIA           100.000000   0.000000  0.000000  0.000000
ALB           100.000000   0.000000  0.000000  0.000000
AND           100.000000   0.000000  0.000000  0.000000
...                  ...        ...       ...       ...
YEM           100.000000   0.000000  0.000000  0.000000
ZAF           100.000000   0.000000  0.000000  0.000000
ZMB           100.000000   0.000000  0.000000  0.000000
ZWE           100.000000   0.000000  0.000000  0.000000
unknown         0.066647  99.932809  0.000272  0.000272

[178 rows x 4 columns]

Part5: Conclusions

In this study, "Decoding Startup Success: A Data Science Approach to Predicting Venture Outcomes," I implemented a robust, methodologically rigorous framework to identify and predict the multifactorial determinants of startup success. By leveraging advanced machine learning models and clustering techniques, I derived actionable insights that can inform strategic decision-making for investors, entrepreneurs, and ecosystem stakeholders.

Model Performance and Insights

Our analysis utilized two high-performing classification algorithms—Random Forest and XGBoost—to predict startup success outcomes. The comparative assessment underscored distinct performance advantages for each model:

XGBoost

  • Higher Precision (74.5%): Reduces false positives, ensuring that predicted successful startups are more likely to achieve positive outcomes.
  • Superior Accuracy (90.9%) and ROC AUC Score (96.1%): Indicates exceptional overall discriminative capability between successful and unsuccessful startups.
  • Lower False Positives (5,293): Mitigates the misclassification of unsuccessful startups, which is critical for avoiding resource misallocation and misguided investment.

Random Forest

  • Higher Recall (97.9%): Captures the majority of successful startups, minimizing the risk of overlooking high-potential ventures.
  • Balanced F1 Score (79.1%): Offers a harmonized measure of precision and recall, making it appropriate when both false negatives and false positives carry significant implications.
  • Lower False Negatives (374): Ensures that high-value opportunities are rarely missed, which is pivotal in identifying promising startups.

Recommendation:

  • XGBoost is ideal in scenarios where precision and overall accuracy are paramount, and false positives entail substantial costs.
  • Random Forest is preferable when the focus is on recall, and minimizing false negatives is critical for capturing emerging opportunities.
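The trade-off described above does not have to be resolved purely by choosing between the two models: both output probabilities, so the 0.5 decision threshold can itself be tuned. The sketch below is illustrative only; it reuses y_test and the XGBoost probabilities y_pred_proba_xgb from the comparison section and assumes a hypothetical policy of keeping recall at or above 95%.

# Sketch: pick the highest threshold that still meets a recall target
# (the 95% target here is a hypothetical policy, not a finding of the study).
import numpy as np
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba_xgb)
mask = recalls[:-1] >= 0.95
if mask.any():
    chosen = thresholds[mask][-1]
    print(f"Threshold: {chosen:.3f}, precision at that threshold: {precisions[:-1][mask][-1]:.3f}")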

Feature Importance

The assessment of feature importance revealed distinct yet complementary perspectives across the two models:

  • XGBoost prioritized a broad array of influential features, reflecting the multifaceted nature of startup success. Notably, total_funding consistently emerged as a pivotal determinant, underscoring the primacy of financial resources in fostering growth and sustainability.
  • Random Forest identified a more targeted subset of dominant predictors, with countries_with_offices demonstrating significant weight. This highlights the role of geographical diversification and strategic market positioning in enabling success.

Clustering Analysis

The integration of K-Means clustering, while exploratory in nature, yielded significant insights into the heterogeneity of startup profiles and their operational characteristics:

Cluster Characteristics

  • Cluster 0: The bulk of typical startups, with modest average funding (a few million USD), small teams, and a single office, spanning nearly all named industries and countries.
  • Cluster 1: Startups with minimal recorded funding, milestones, and office presence; this cluster also absorbs most records with unknown industry and country codes, so it partly reflects sparsely documented companies.
  • Cluster 2: A small group of large, mature companies with teams in the hundreds, substantial funding, and heavy acquisition activity, often signaling market leadership and significant exits via mergers and acquisitions.
  • Cluster 3: Entities dominated by investment activity, averaging hundreds of investment rounds and large portfolios of invested companies, behaving more like investors than conventional startups.

Categorical and Regional Distributions

  • Sectoral Trends: Capital-intensive sectors such as analytics, biotech, nanotech, and semiconductors are concentrated almost entirely in Cluster 0, underscoring sector-specific investment patterns and growth ecosystems.
  • Country Code Distribution: Startups with a known country code fall overwhelmingly into Cluster 0, while records with an unknown country (and usually an unknown category) concentrate in Cluster 1, reinforcing the reading of Cluster 1 as sparsely documented companies.

Implications of Clustering

  • Segmentation: Clustering facilitates the classification of startups into distinct archetypes, enabling tailored intervention strategies for investment, mentorship, and resource optimization.
  • Trend Identification: The analysis highlights sectoral and regional patterns, offering stakeholders the ability to anticipate market trends and allocate resources strategically.
  • Customized Support Mechanisms: Each cluster necessitates bespoke support structures—from targeted funding mechanisms for nascent startups to acquisition advisory for mature enterprises.

Final Reflections

This research underscores the intricate interplay of financial, operational, and strategic factors in determining startup success. By integrating predictive modeling with clustering analysis, I have demonstrated the ability to forecast success probabilities and provided a granular understanding of the latent structural patterns that characterize startup ecosystems.

Key Contributions

  • Empirical Decision-Making: Machine learning models offer a quantitative foundation for stakeholders, enabling decisions grounded in robust empirical evidence.
  • Comprehensive Insights: The dual application of classification and clustering methodologies provides a holistic perspective on both individual startup predictions and broader ecosystem dynamics.
  • Strategic Resource Allocation: Insights derived from feature importance and cluster segmentation can guide investment strategies, optimize resource allocation, and enhance portfolio performance.

Future Directions

To extend the scope and impact of this analysis, I propose the following avenues for future work:

  • Advanced Feature Engineering: Incorporate granular predictors such as founder experience, technological capabilities, and macroeconomic indicators to refine model performance.
  • Longitudinal Analysis: Conduct temporal studies to understand the evolution of startups over time and identify critical inflection points influencing success.
  • Dynamic Prediction Systems: Develop interactive, real-time dashboards that integrate live data streams, providing continuous performance monitoring and predictive insights.

By advancing these methodologies and incorporating richer data, I aim to further elucidate the multifactorial determinants of startup success, thereby contributing to a deeper and more actionable understanding of this complex landscape.

References:

1. Data Science Lifecycle Overview

  • Data Collection : Gathering raw data from various sources. Tools: Pandas.
  • Data Cleaning : Preprocessing data by handling missing values, encoding categorical variables, and removing duplicates. Tools: Pandas, Numpy.
  • Exploratory Data Analysis (EDA) : Analyzing data distributions and relationships to extract meaningful insights. Tools: Seaborn, Matplotlib, Plotly.
  • Feature Engineering : Creating new features or selecting important ones to improve model performance. Tools: Scikit-learn.
  • Model Training & Evaluation : Building and testing models using various algorithms, then evaluating their performance. Tools: Scikit-learn, XGBoost.

Dataset Source:

Startup Investments

2. Machine Learning Algorithms Used:

  • Random Forest
  • XGBoost
  • KMeans

3. Key Libraries and Tools Used:

  • Pandas - Data manipulation and analysis library.
  • Numpy - Core library for numerical computations.
  • Scikit-Learn - A machine learning library offering tools for data analysis.
  • Matplotlib - A library to create visualizations.
  • Seaborn - A statistical data visualization library.
  • Imbalanced-Learn - A library for handling imbalanced datasets.
  • Lifelines - A survival analysis library.
  • Folium - A library for creating interactive maps.
  • Statsmodels - A library for statistical modeling and tests.
  • XGBoost - A highly efficient gradient boosting library.

4. Helpful Resources (Data Science)

  • What is Data Science?
  • Advantages of Data-Driven Decision-Making
  • Data Science Process
  • Where to find datasets
  • How Random Forest Algorithm works
  • XGBoost — How does this work
  • K-Means Clustering Explained
  • The Ultimate Guide to Survival Analysis
  • Kaggle
  • Python

5. Helpful Resources (Startup)

  • What is a Startup?
  • What Makes a Successful Startup?
  • Startup Metrics 101: What to Track and Why It Matters
  • 60 Of The Most Common Startup Funding Terms
  • The 11 Most Important Growth Metrics for Startups
  • What Makes a Startup Successful FAQs
  • Successful Startups: Do Their Entrepreneurs Share a Common Characteristic? (ResearchGate)
  • Successful development of startups as a global trend of innovative socio-economic transformations
  • Crunchbase