Here are steps to extract and clean data.
Here we extract raw data from the YouTube API, officially called the YouTube Data API v3.
The console page for the API is: https://console.cloud.google.com/apis/api/youtube.googleapis.com/metrics?project=vvs1-399119&authuser=2&supportedpurview=project
The API is enabled from the Google Cloud console, where you can select it and generate an API key; this key is then used to authenticate extraction requests.
The screenshot of the page is given below:
The data is extracted as a JSON file. Each item contains fields such as videoId, publishedAt, channelId, title, description, and channelTitle. We extract the data using the code below:
import json
import requests

# Define your API key and API endpoint
api_key = 'YOUR_API_KEY'  # Replace with your own YouTube Data API v3 key
api_endpoint = 'https://www.googleapis.com/youtube/v3/search'

# Define query parameters (you can customize these)
params = {
    'part': 'snippet',
    'maxResults': 50,  # Number of results per page (the API maximum is 50)
    'q': 'all genres',  # Your search query
    'key': api_key,
    'relevanceLanguage': 'en',  # Restrict results to English
}

# Initialize variables
total_results = 10000  # Total number of results you want to retrieve
videos = []

while len(videos) < total_results:
    # Make a request to the YouTube API
    response = requests.get(api_endpoint, params=params)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()

        # Extract video data from the response
        videos.extend(data.get('items', []))

        # Get the nextPageToken for pagination
        nextPageToken = data.get('nextPageToken')
        if nextPageToken:
            # Update the 'pageToken' parameter for the next request
            params['pageToken'] = nextPageToken
        else:
            # No more pages, stop
            break
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        print(response.text)  # Print the error response for debugging
        break  # Stop instead of retrying the same failing request forever

# Save the retrieved video data to a JSON file
with open('english_videos_2.json', 'w', encoding='utf-8') as json_file:
    json.dump(videos, json_file, ensure_ascii=False, indent=4)

print('English language music videos data saved to english_videos_2.json')
The JSON file is shown below.
After that, we convert the JSON file into a CSV file.
import pandas as pd
import json

# Load the JSON data produced in the previous step
with open('english_videos_new_2.json', 'r') as json_file:
    data = json.load(json_file)

# Convert the JSON data to a pandas DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('yt_english_final.csv', index=False)

print("JSON data has been converted to CSV successfully.")
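Note that each item returned by the search endpoint is a nested object (an id block plus a snippet block), so passing the list straight to pd.DataFrame leaves dictionaries inside the cells. One way to flatten the nesting is pandas.json_normalize, sketched here on a minimal hand-written item in the same shape the API returns (the videoId and titles below are made up for illustration):

```python
import pandas as pd

# A minimal item in the shape returned by the search endpoint
# (the real JSON file contains one such object per video).
items = [
    {
        "id": {"kind": "youtube#video", "videoId": "abc123"},
        "snippet": {
            "publishedAt": "2023-01-01T00:00:00Z",
            "channelId": "UC_x1",
            "title": "Sample video",
            "description": "A sample description",
            "channelTitle": "Sample channel",
        },
    }
]

# json_normalize flattens nested dicts into dotted column names,
# e.g. 'id.videoId' and 'snippet.title'
df = pd.json_normalize(items)
print(df.columns.tolist())
```

The dotted column names can then be renamed before saving to CSV if plain names like videoId are preferred.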
The resulting DataFrame contains a thumbnails column, made up of thumbnail URLs, heights, and widths, which is not needed for our analysis, so we remove it.
import pandas as pd
# Load the CSV file into a DataFrame
df = pd.read_csv('yt_english_final.csv')
# Specify the column(s) you want to remove
columns_to_remove = ['thumbnails']  # Column(s) to drop
# Use the drop() method to remove the column(s)
df.drop(columns=columns_to_remove, inplace=True)
# Save the modified DataFrame back to a CSV file
df.to_csv('modified_dataset_YT_eng.csv', index=False) # Set index=False to exclude row numbers from the CSV
We then add columns for views, likes, and dislikes, since these fields were not present in the dataset; placeholder values are generated at random within plausible ranges.
import pandas as pd
import numpy as np
# Load the CSV file
df = pd.read_csv('modified_dataset_YT_eng.csv')
# Specify the range for random likes, dislikes, and views
min_likes = 100
max_likes = 10000
min_dislikes = 10
max_dislikes = 500
min_views = 1000
max_views = 1000000
# Generate random numbers for likes, dislikes, and views
df['Likes'] = np.random.randint(min_likes, max_likes, size=len(df))
df['Dislikes'] = np.random.randint(min_dislikes, max_dislikes, size=len(df))
df['Views'] = np.random.randint(min_views, max_views, size=len(df))
# Save the updated DataFrame to a CSV file
df.to_csv('updated_file_1.csv', index=False)
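For reference, real view and like counts can be retrieved from the API's videos endpoint by requesting part='statistics' for a batch of video IDs; the random numbers above are stand-ins. Below is a sketch of parsing such a response; the sample payload is hand-written in the documented response shape (with a made-up video ID), not fetched live:

```python
# Sample payload in the shape of a videos.list (part='statistics')
# response; a live call would be a GET request to
# https://www.googleapis.com/youtube/v3/videos with id and key params.
sample_response = {
    "items": [
        {
            "id": "abc123",
            "statistics": {"viewCount": "1234", "likeCount": "56"},
        }
    ]
}

def extract_statistics(response_json):
    """Map each video id to integer view/like counts."""
    stats = {}
    for item in response_json.get("items", []):
        s = item.get("statistics", {})
        stats[item["id"]] = {
            "views": int(s.get("viewCount", 0)),
            "likes": int(s.get("likeCount", 0)),
        }
    return stats

print(extract_statistics(sample_response))
```

Note the API returns counts as strings, so they are cast to int here; public dislike counts are no longer exposed by the API, which is one reason a placeholder column is used above.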
We then delete a duplicate column called "publishTime", which has the same values as the publishedAt column.
import pandas as pd
# Load the CSV file into a DataFrame
df = pd.read_csv('updated_file_1.csv')
# Specify the column(s) you want to remove
columns_to_remove = ['publishTime']  # Duplicate of publishedAt
# Use the drop() method to remove the column(s)
df.drop(columns=columns_to_remove, inplace=True)
# Save the modified DataFrame back to a CSV file
df.to_csv('modified_dataset_YT_eng_1.csv', index=False) # Set index=False to exclude row numbers from the CSV
We also fill the empty cells in the description column, since dropping those rows would discard valuable information in the other columns.
import pandas as pd
# Load your CSV file into a DataFrame
df = pd.read_csv('modified_dataset_YT_eng_1.csv') # Replace with your CSV file path
# Specify the column you want to fill
column_to_fill = 'description' # Replace with the name of the column you want to fill
# Specify the text to fill empty fields
text_to_fill = 'Please Like and Subscribe to the video' # Replace with the text you want to use for filling
# Fill empty fields in the specified column with the text
df[column_to_fill] = df[column_to_fill].fillna(text_to_fill)
# Save the DataFrame back to the CSV file (optional)
df.to_csv('output_csv_file_end.csv', index=False) # Replace with the desired output file path
We also add a video duration column (in seconds), as this too was not present in the dataset; durations are again randomly generated.
import pandas as pd
import random
# Read your CSV data into a Pandas DataFrame
df = pd.read_csv('output_csv_file_end.csv')
# Generate random duration values (in seconds) for the new column
min_duration = 60 # Minimum duration in seconds
max_duration = 1800 # Maximum duration in seconds
# Generate random durations for each row
df['Video_Duration (in seconds)'] = [random.randint(min_duration, max_duration) for _ in range(len(df))]
# Save the modified DataFrame to a new CSV file
df.to_csv('youtube_data_final_1.csv', index=False)
The final cleaned CSV looks like this.
The cleaned data frame looks like this:
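As a final sanity check, we can confirm that no missing descriptions remain and that all the added columns are present. The sketch below runs the checks on a small hand-built frame with the same columns; on the real data, the frame would instead be loaded with pd.read_csv('youtube_data_final_1.csv'):

```python
import pandas as pd

# Toy frame mirroring the final dataset's columns (stand-in for
# pd.read_csv('youtube_data_final_1.csv')); the values are made up.
df = pd.DataFrame({
    "videoId": ["a1", "b2"],
    "description": ["desc one", "Please Like and Subscribe to the video"],
    "Likes": [120, 4500],
    "Dislikes": [12, 80],
    "Views": [1500, 200000],
    "Video_Duration (in seconds)": [90, 600],
})

# No missing values should remain after the fill step
assert df["description"].notna().all()

# Every column added during cleaning should be present
for col in ["Likes", "Dislikes", "Views", "Video_Duration (in seconds)"]:
    assert col in df.columns

print("Sanity checks passed.")
```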