TikTok Project
Inspect and analyze data
The purpose of this project is to investigate and understand the data provided.
Imports and data loading
# Import packages
import pandas as pd
import numpy as np
Loading the dataset into a dataframe
# Read the TikTok CSV export into the working dataframe
data = pd.read_csv(filepath_or_buffer="tiktok_dataset.csv")
Inspect the data
# Preview the first 10 rows to see the columns and some sample values
data.head(n=10)
|
# |
claim_status |
video_id |
video_duration_sec |
video_transcription_text |
verified_status |
author_ban_status |
video_view_count |
video_like_count |
video_share_count |
video_download_count |
video_comment_count |
| 0 |
1 |
claim |
7017666017 |
59 |
someone shared with me that drone deliveries a... |
not verified |
under review |
343296.0 |
19425.0 |
241.0 |
1.0 |
0.0 |
| 1 |
2 |
claim |
4014381136 |
32 |
someone shared with me that there are more mic... |
not verified |
active |
140877.0 |
77355.0 |
19034.0 |
1161.0 |
684.0 |
| 2 |
3 |
claim |
9859838091 |
31 |
someone shared with me that american industria... |
not verified |
active |
902185.0 |
97690.0 |
2858.0 |
833.0 |
329.0 |
| 3 |
4 |
claim |
1866847991 |
25 |
someone shared with me that the metro of st. p... |
not verified |
active |
437506.0 |
239954.0 |
34812.0 |
1234.0 |
584.0 |
| 4 |
5 |
claim |
7105231098 |
19 |
someone shared with me that the number of busi... |
not verified |
active |
56167.0 |
34987.0 |
4110.0 |
547.0 |
152.0 |
| 5 |
6 |
claim |
8972200955 |
35 |
someone shared with me that gross domestic pro... |
not verified |
under review |
336647.0 |
175546.0 |
62303.0 |
4293.0 |
1857.0 |
| 6 |
7 |
claim |
4958886992 |
16 |
someone shared with me that elvis presley has ... |
not verified |
active |
750345.0 |
486192.0 |
193911.0 |
8616.0 |
5446.0 |
| 7 |
8 |
claim |
2270982263 |
41 |
someone shared with me that the best selling s... |
not verified |
active |
547532.0 |
1072.0 |
50.0 |
22.0 |
11.0 |
| 8 |
9 |
claim |
5235769692 |
50 |
someone shared with me that about half of the ... |
not verified |
active |
24819.0 |
10160.0 |
1050.0 |
53.0 |
27.0 |
| 9 |
10 |
claim |
4660861094 |
45 |
someone shared with me that it would take a 50... |
verified |
active |
931587.0 |
171051.0 |
67739.0 |
4104.0 |
2540.0 |
# Get summary info: index range, per-column non-null counts and dtypes, memory usage
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19382 entries, 0 to 19381
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 # 19382 non-null int64
1 claim_status 19084 non-null object
2 video_id 19382 non-null int64
3 video_duration_sec 19382 non-null int64
4 video_transcription_text 19084 non-null object
5 verified_status 19382 non-null object
6 author_ban_status 19382 non-null object
7 video_view_count 19084 non-null float64
8 video_like_count 19084 non-null float64
9 video_share_count 19084 non-null float64
10 video_download_count 19084 non-null float64
11 video_comment_count 19084 non-null float64
dtypes: float64(5), int64(3), object(4)
memory usage: 1.8+ MB
# Get summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
data.describe()
|
# |
video_id |
video_duration_sec |
video_view_count |
video_like_count |
video_share_count |
video_download_count |
video_comment_count |
| count |
19382.000000 |
1.938200e+04 |
19382.000000 |
19084.000000 |
19084.000000 |
19084.000000 |
19084.000000 |
19084.000000 |
| mean |
9691.500000 |
5.627454e+09 |
32.421732 |
254708.558688 |
84304.636030 |
16735.248323 |
1049.429627 |
349.312146 |
| std |
5595.245794 |
2.536440e+09 |
16.229967 |
322893.280814 |
133420.546814 |
32036.174350 |
2004.299894 |
799.638865 |
| min |
1.000000 |
1.234959e+09 |
5.000000 |
20.000000 |
0.000000 |
0.000000 |
0.000000 |
0.000000 |
| 25% |
4846.250000 |
3.430417e+09 |
18.000000 |
4942.500000 |
810.750000 |
115.000000 |
7.000000 |
1.000000 |
| 50% |
9691.500000 |
5.618664e+09 |
32.000000 |
9954.500000 |
3403.500000 |
717.000000 |
46.000000 |
9.000000 |
| 75% |
14536.750000 |
7.843960e+09 |
47.000000 |
504327.000000 |
125020.000000 |
18222.000000 |
1156.250000 |
292.000000 |
| max |
19382.000000 |
9.999873e+09 |
60.000000 |
999817.000000 |
657830.000000 |
256130.000000 |
14994.000000 |
9599.000000 |
Investigate the variables
# Tally the videos under each claim_status label (rows with a missing label are not counted)
data.loc[:, 'claim_status'].value_counts(dropna=True)
claim 9608
opinion 9476
Name: claim_status, dtype: int64
# Mean and median view count for the videos labelled "claim"
claims = data.loc[data['claim_status'].eq('claim')]
print('Mean view count claims:', claims['video_view_count'].agg('mean'))
print('Median view count claims:', claims['video_view_count'].agg('median'))
Mean view count claims: 501029.4527477102
Median view count claims: 501555.0
# Mean and median view count for the videos labelled "opinion"
opinions = data.loc[data['claim_status'].eq('opinion')]
print('Mean view count opinions:', opinions['video_view_count'].agg('mean'))
print('Median view count opinions:', opinions['video_view_count'].agg('median'))
Mean view count opinions: 4956.43224989447
Median view count opinions: 4953.0
# Number of rows for every (claim_status, author_ban_status) pairing,
# counted on the '#' row-id column only
data.groupby(['claim_status', 'author_ban_status'])[['#']].count()
|
|
# |
| claim_status |
author_ban_status |
|
| claim |
active |
6566 |
| banned |
1439 |
| under review |
1603 |
| opinion |
active |
8817 |
| banned |
196 |
| under review |
463 |
# Mean and median of views, likes and shares for each author ban status
data.groupby(['author_ban_status']).agg(
    {column: ['mean', 'median']
     for column in ('video_view_count', 'video_like_count', 'video_share_count')})
|
video_view_count |
video_like_count |
video_share_count |
|
mean |
median |
mean |
median |
mean |
median |
| author_ban_status |
|
|
|
|
|
|
| active |
215927.039524 |
8616.0 |
71036.533836 |
2222.0 |
14111.466164 |
437.0 |
| banned |
445845.439144 |
448201.0 |
153017.236697 |
105573.0 |
29998.942508 |
14468.0 |
| under review |
392204.836399 |
365245.5 |
128718.050339 |
71204.5 |
25774.696999 |
9444.0 |
# Median share count per author ban status
data.groupby(['author_ban_status'])[['video_share_count']].median()
|
video_share_count |
| author_ban_status |
|
| active |
437.0 |
| banned |
14468.0 |
| under review |
9444.0 |
# Count, mean and median of views, likes and shares for each author ban status
data.groupby(['author_ban_status']).agg(
    {column: ['count', 'mean', 'median']
     for column in ('video_view_count', 'video_like_count', 'video_share_count')})
|
video_view_count |
video_like_count |
video_share_count |
|
count |
mean |
median |
count |
mean |
median |
count |
mean |
median |
| author_ban_status |
|
|
|
|
|
|
|
|
|
| active |
15383 |
215927.039524 |
8616.0 |
15383 |
71036.533836 |
2222.0 |
15383 |
14111.466164 |
437.0 |
| banned |
1635 |
445845.439144 |
448201.0 |
1635 |
153017.236697 |
105573.0 |
1635 |
29998.942508 |
14468.0 |
| under review |
2066 |
392204.836399 |
365245.5 |
2066 |
128718.050339 |
71204.5 |
2066 |
25774.696999 |
9444.0 |
# Engagement rates: each raw count divided element-wise by the video's view count
# likes per view
data['likes_per_view'] = data['video_like_count'].div(data['video_view_count'])
# comments per view
data['comments_per_view'] = data['video_comment_count'].div(data['video_view_count'])
# shares per view
data['shares_per_view'] = data['video_share_count'].div(data['video_view_count'])
# Count, mean and median of each per-view rate for every
# (claim_status, author_ban_status) pairing
data.groupby(['claim_status', 'author_ban_status']).agg(
    {rate: ['count', 'mean', 'median']
     for rate in ('likes_per_view', 'comments_per_view', 'shares_per_view')})
|
|
likes_per_view |
comments_per_view |
shares_per_view |
|
|
count |
mean |
median |
count |
mean |
median |
count |
mean |
median |
| claim_status |
author_ban_status |
|
|
|
|
|
|
|
|
|
| claim |
active |
6566 |
0.329542 |
0.326538 |
6566 |
0.001393 |
0.000776 |
6566 |
0.065456 |
0.049279 |
| banned |
1439 |
0.345071 |
0.358909 |
1439 |
0.001377 |
0.000746 |
1439 |
0.067893 |
0.051606 |
| under review |
1603 |
0.327997 |
0.320867 |
1603 |
0.001367 |
0.000789 |
1603 |
0.065733 |
0.049967 |
| opinion |
active |
8817 |
0.219744 |
0.218330 |
8817 |
0.000517 |
0.000252 |
8817 |
0.043729 |
0.032405 |
| banned |
196 |
0.206868 |
0.198483 |
196 |
0.000434 |
0.000193 |
196 |
0.040531 |
0.030728 |
| under review |
463 |
0.226394 |
0.228051 |
463 |
0.000536 |
0.000293 |
463 |
0.044472 |
0.035027 |
- Around 50% of the labelled videos are claims (9,608 of 19,084); the rest are opinions
- Engagement is strongly associated with claim status: claim videos have far higher view counts than opinion videos (mean of about 501,000 vs. about 5,000)
- Videos from banned authors have higher engagement (views, likes and shares) than videos from active authors