TikTok Project

Inspect and analyze data

The purpose of this project is to investigate and understand the data provided.

Imports and data loading

# Import packages
import pandas as pd
import numpy as np

Loading the dataset into a dataframe

# Read the TikTok dataset CSV from the working directory into a dataframe
data = pd.read_csv(filepath_or_buffer="tiktok_dataset.csv")

Inspect the data

# Preview the first 10 rows to see the columns and typical values
data.head(n=10)

	#	claim_status	video_id	video_duration_sec	video_transcription_text	verified_status	author_ban_status	video_view_count	video_like_count	video_share_count	video_download_count	video_comment_count
0	1	claim	7017666017	59	someone shared with me that drone deliveries a...	not verified	under review	343296.0	19425.0	241.0	1.0	0.0
1	2	claim	4014381136	32	someone shared with me that there are more mic...	not verified	active	140877.0	77355.0	19034.0	1161.0	684.0
2	3	claim	9859838091	31	someone shared with me that american industria...	not verified	active	902185.0	97690.0	2858.0	833.0	329.0
3	4	claim	1866847991	25	someone shared with me that the metro of st. p...	not verified	active	437506.0	239954.0	34812.0	1234.0	584.0
4	5	claim	7105231098	19	someone shared with me that the number of busi...	not verified	active	56167.0	34987.0	4110.0	547.0	152.0
5	6	claim	8972200955	35	someone shared with me that gross domestic pro...	not verified	under review	336647.0	175546.0	62303.0	4293.0	1857.0
6	7	claim	4958886992	16	someone shared with me that elvis presley has ...	not verified	active	750345.0	486192.0	193911.0	8616.0	5446.0
7	8	claim	2270982263	41	someone shared with me that the best selling s...	not verified	active	547532.0	1072.0	50.0	22.0	11.0
8	9	claim	5235769692	50	someone shared with me that about half of the ...	not verified	active	24819.0	10160.0	1050.0	53.0	27.0
9	10	claim	4660861094	45	someone shared with me that it would take a 50...	verified	active	931587.0	171051.0	67739.0	4104.0	2540.0

# Print a concise summary of the dataframe: number of rows, each column's
# non-null count and dtype, and overall memory usage

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19382 entries, 0 to 19381
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   #                         19382 non-null  int64  
 1   claim_status              19084 non-null  object 
 2   video_id                  19382 non-null  int64  
 3   video_duration_sec        19382 non-null  int64  
 4   video_transcription_text  19084 non-null  object 
 5   verified_status           19382 non-null  object 
 6   author_ban_status         19382 non-null  object 
 7   video_view_count          19084 non-null  float64
 8   video_like_count          19084 non-null  float64
 9   video_share_count         19084 non-null  float64
 10  video_download_count      19084 non-null  float64
 11  video_comment_count       19084 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 1.8+ MB

# Get summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns

data.describe()

	#	video_id	video_duration_sec	video_view_count	video_like_count	video_share_count	video_download_count	video_comment_count
count	19382.000000	1.938200e+04	19382.000000	19084.000000	19084.000000	19084.000000	19084.000000	19084.000000
mean	9691.500000	5.627454e+09	32.421732	254708.558688	84304.636030	16735.248323	1049.429627	349.312146
std	5595.245794	2.536440e+09	16.229967	322893.280814	133420.546814	32036.174350	2004.299894	799.638865
min	1.000000	1.234959e+09	5.000000	20.000000	0.000000	0.000000	0.000000	0.000000
25%	4846.250000	3.430417e+09	18.000000	4942.500000	810.750000	115.000000	7.000000	1.000000
50%	9691.500000	5.618664e+09	32.000000	9954.500000	3403.500000	717.000000	46.000000	9.000000
75%	14536.750000	7.843960e+09	47.000000	504327.000000	125020.000000	18222.000000	1156.250000	292.000000
max	19382.000000	9.999873e+09	60.000000	999817.000000	657830.000000	256130.000000	14994.000000	9599.000000

Investigate the variables

# Which claim status labels occur, and how many videos carry each one?
# (value_counts skips rows where the status is missing.)
status_column = data['claim_status']
status_column.value_counts()

claim      9608
opinion    9476
Name: claim_status, dtype: int64

# Mean and median view count of videos labeled "claim"
is_claim = data['claim_status'] == 'claim'
claims = data[is_claim]
claim_views = claims['video_view_count']
print('Mean view count claims:', claim_views.mean())
print('Median view count claims:', claim_views.median())

Mean view count claims: 501029.4527477102
Median view count claims: 501555.0

# Mean and median view count of videos labeled "opinion"
opinions = data.loc[data['claim_status'] == 'opinion']
opinion_views = opinions['video_view_count']
print('Mean view count opinions:', opinion_views.mean())
print('Median view count opinions:', opinion_views.median())

Mean view count opinions: 4956.43224989447
Median view count opinions: 4953.0

# Number of videos in every (claim status, author ban status) combination.
# The '#' column has no missing values, so counting it gives the group size;
# selecting it before count() avoids counting every other column first.
status_groups = data.groupby(['claim_status', 'author_ban_status'])
status_groups[['#']].count()

		#
claim_status	author_ban_status
claim	active	6566
	banned	1439
	under review	1603
opinion	active	8817
	banned	196
	under review	463


# Mean and median view, like, and share counts for each author ban status
engagement_columns = ['video_view_count', 'video_like_count', 'video_share_count']
data.groupby(['author_ban_status']).agg(
    {column: ['mean', 'median'] for column in engagement_columns})

	video_view_count		video_like_count		video_share_count
	mean	median	mean	median	mean	median
author_ban_status
active	215927.039524	8616.0	71036.533836	2222.0	14111.466164	437.0
banned	445845.439144	448201.0	153017.236697	105573.0	29998.942508	14468.0
under review	392204.836399	365245.5	128718.050339	71204.5	25774.696999	9444.0

# Median video share count for each author ban status
shares_by_ban_status = data.groupby(['author_ban_status'])[['video_share_count']]
shares_by_ban_status.median()

	video_share_count
author_ban_status
active	437.0
banned	14468.0
under review	9444.0


# Count, mean, and median of view, like, and share counts per author ban status
summary_stats = ['count', 'mean', 'median']
count_columns = ('video_view_count', 'video_like_count', 'video_share_count')
by_ban_status = data.groupby(['author_ban_status'])
by_ban_status.agg(dict.fromkeys(count_columns, summary_stats))

	video_view_count			video_like_count			video_share_count
	count	mean	median	count	mean	median	count	mean	median
author_ban_status
active	15383	215927.039524	8616.0	15383	71036.533836	2222.0	15383	14111.466164	437.0
banned	1635	445845.439144	448201.0	1635	153017.236697	105573.0	1635	29998.942508	14468.0
under review	2066	392204.836399	365245.5	2066	128718.050339	71204.5	2066	25774.696999	9444.0

# Add engagement-rate columns: each raw count divided by the video's view count.
# Maps the new column name to the count column it is derived from.
ratio_sources = {
    'likes_per_view': 'video_like_count',
    'comments_per_view': 'video_comment_count',
    'shares_per_view': 'video_share_count',
}
views = data['video_view_count']
for ratio_name, count_name in ratio_sources.items():
    data[ratio_name] = data[count_name] / views

# Count, mean, and median of each per-view engagement rate, grouped by
# claim status and author ban status
rate_columns = ['likes_per_view', 'comments_per_view', 'shares_per_view']
data.groupby(['claim_status', 'author_ban_status']).agg(
    {rate: ['count', 'mean', 'median'] for rate in rate_columns})

		likes_per_view			comments_per_view			shares_per_view
		count	mean	median	count	mean	median	count	mean	median
claim_status	author_ban_status
claim	active	6566	0.329542	0.326538	6566	0.001393	0.000776	6566	0.065456	0.049279
	banned	1439	0.345071	0.358909	1439	0.001377	0.000746	1439	0.067893	0.051606
	under review	1603	0.327997	0.320867	1603	0.001367	0.000789	1603	0.065733	0.049967
opinion	active	8817	0.219744	0.218330	8817	0.000517	0.000252	8817	0.043729	0.032405
	banned	196	0.206868	0.198483	196	0.000434	0.000193	196	0.040531	0.030728
	under review	463	0.226394	0.228051	463	0.000536	0.000293	463	0.044472	0.035027

About 50% of the videos with a known claim status are claims (9,608 of 19,084); the rest are opinions.
Engagement is strongly associated with claim status: claim videos have a median of about 501,555 views, versus about 4,953 for opinion videos.
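Videos by banned authors have far higher engagement counts than videos by active authors (median views of 448,201 vs. 8,616), with videos by authors under review in between; within each claim status, however, the per-view engagement rates are similar across ban statuses.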