英文:
Dataframe is Being Returned as Empty
问题
我是中文翻译,以下是你要翻译的代码部分:
import pandas as pd
import numpy as np
# Google API
from googleapiclient.discovery import build
api_key = '<api key>'
channel_ids = ['<channel_id>', ]
youtube = build('youtube', 'v3', developerKey=api_key)
def get_channel_stats(youtube, channel_ids):
"""
获取频道统计信息:标题,订阅者数量,观看次数,视频数量,上传播放列表
参数:
youtube:来自googleapiclient.discovery的构建对象
channels_ids:频道ID列表
返回:
包含提供列表中所有频道的频道统计信息的数据框:标题,订阅者数量,观看次数,视频数量,上传播放列表
"""
all_data = []
request = youtube.channels().list(
part='snippet,contentDetails,statistics',
id=','.join(channel_ids))
response = request.execute()
for i in range(len(response['items'])):
data = dict(channelName=response['items'][i]['snippet']['title'],
subscribers=response['items'][i]['statistics']['subscriberCount'],
views=response['items'][i]['statistics']['viewCount'],
totalVideos=response['items'][i]['statistics']['videoCount'],
playlistId=response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
all_data.append(data)
return pd.DataFrame(all_data)
def get_video_ids(youtube, playlist_id):
"""
获取给定播放列表中所有视频的视频ID列表
参数:
youtube:来自googleapiclient.discovery的构建对象
playlist_id:频道的播放列表ID
返回:
播放列表中所有视频的视频ID列表
"""
request = youtube.playlistItems().list(
part='contentDetails',
playlistId=playlist_id,
maxResults=50)
response = request.execute()
video_ids = []
for i in range(len(response['items'])):
video_ids.append(response['items'][i]['contentDetails']['videoId'])
next_page_token = response.get('nextPageToken')
more_pages = True
while more_pages:
if next_page_token is None:
more_pages = False
else:
request = youtube.playlistItems().list(
part='contentDetails',
playlistId=playlist_id,
maxResults=50,
pageToken=next_page_token)
response = request.execute()
for i in range(len(response['items'])):
video_ids.append(response['items'][i]['contentDetails']['videoId'])
next_page_token = response.get('nextPageToken')
return video_ids
def get_video_details(youtube, video_ids):
"""
获取具有给定ID的所有视频的视频统计信息
参数:
youtube:来自googleapiclient.discovery的构建对象
video_ids:视频ID列表
返回:
具有视频统计信息的数据帧,即:
'channelTitle','title','description','tags','publishedAt'
'viewCount','likeCount','favoriteCount','commentCount'
'duration','definition','caption'
"""
all_video_info = []
for i in range(0, len(video_ids), 50):
request = youtube.videos().list(
part="snippet,contentDetails,statistics",
id=','.join(video_ids[i:i + 50])
)
response = request.execute()
for video in response['items']:
stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
'contentDetails': ['duration', 'definition', 'caption']
}
video_info = {}
video_info['video_id'] = video['id']
for k in stats_to_keep.keys():
for v in stats_to_keep[k]:
try:
video_info[v] = video[k][v]
except:
video_info[v] = None
all_video_info.append(video_info)
def get_comments_in_videos(youtube, video_ids):
"""
获取所有具有给定ID的视频的文本格式的顶级评论(仅由于Youtube API的引用限制,获取前10条评论)
参数:
youtube:来自googleapiclient.discovery的构建对象
video_ids:视频ID列表
返回:
具有视频ID和相关的顶级评论文本的数据帧。
"""
all_comments = []
for video_id in video_ids:
try:
request = youtube.commentThreads().list(
part="snippet,replies",
videoId=video_id
)
response = request.execute()
comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in
response['items'][0:10]]
comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}
all_comments.append(comments_in_video_info)
except:
# 当出现错误时 - 最有可能是因为在视频上禁用了评论
print('无法获取视频的评论:' + video_id)
return pd.DataFrame(all_comments)
channel_data = get_channel_stats(youtube, channel_ids)
# 将计数列转换为数值列
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')
# 创建包含所有频道的视频统计和评论数据的数据帧
video_df = pd.DataFrame()
comments_df = pd.DataFrame()
for c in channel_data['channelName'].unique():
print("从频道获取视频信息:" + c)
playlist_id = channel_data.loc[channel_data['channelName'] == c, 'playlistId'].iloc[0]
video_ids = get_video_ids(youtube, playlist_id)
# 获取视频数据
video_data = get_video_details(youtube, video_ids)
# 获取评论数据
comments_data = get_comments_in_videos(youtube, video_ids)
# 将视频数据和评论数据合并在一起
video_df = video_df.append(video_data, ignore_index=True)
comments_df = comments_df.append(comments_data, ignore_index=True)
print(video_df)
非空的video_df数据帧和数据导出到Excel。
英文:
I am new to Python and am trying to learn it to scrape data off of YouTube. I got this program from a very good YouTube tutorial video; it isn't showing any errors but is returning an empty dataframe for video_df. Looking for advice as to why this is happening and what I can do to correct this?
import pandas as pd
import numpy as np
# Google API
from googleapiclient.discovery import build
api_key = '<api key>'
channel_ids = ['<channel_id>',
]
youtube = build('youtube', 'v3', developerKey=api_key)
def get_channel_stats(youtube, channel_ids):
"""
Get channel statistics: title, subscriber count, view count, video count, upload playlist
Params:
youtube: the build object from googleapiclient.discovery
channels_ids: list of channel IDs
Returns:
Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
"""
all_data = []
request = youtube.channels().list(
part='snippet,contentDetails,statistics',
id=','.join(channel_ids))
response = request.execute()
for i in range(len(response['items'])):
data = dict(channelName=response['items'][i]['snippet']['title'],
subscribers=response['items'][i]['statistics']['subscriberCount'],
views=response['items'][i]['statistics']['viewCount'],
totalVideos=response['items'][i]['statistics']['videoCount'],
playlistId=response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
all_data.append(data)
return pd.DataFrame(all_data)
def get_video_ids(youtube, playlist_id):
"""
Get list of video IDs of all videos in the given playlist
Params:
youtube: the build object from googleapiclient.discovery
playlist_id: playlist ID of the channel
Returns:
List of video IDs of all videos in the playlist
"""
request = youtube.playlistItems().list(
part='contentDetails',
playlistId=playlist_id,
maxResults=50)
response = request.execute()
video_ids = []
for i in range(len(response['items'])):
video_ids.append(response['items'][i]['contentDetails']['videoId'])
next_page_token = response.get('nextPageToken')
more_pages = True
while more_pages:
if next_page_token is None:
more_pages = False
else:
request = youtube.playlistItems().list(
part='contentDetails',
playlistId=playlist_id,
maxResults=50,
pageToken=next_page_token)
response = request.execute()
for i in range(len(response['items'])):
video_ids.append(response['items'][i]['contentDetails']['videoId'])
next_page_token = response.get('nextPageToken')
return video_ids
def get_video_details(youtube, video_ids):
"""
Get video statistics of all videos with given IDs
Params:
youtube: the build object from googleapiclient.discovery
video_ids: list of video IDs
Returns:
Dataframe with statistics of videos, i.e.:
'channelTitle', 'title', 'description', 'tags', 'publishedAt'
'viewCount', 'likeCount', 'favoriteCount', 'commentCount'
'duration', 'definition', 'caption'
"""
all_video_info = []
for i in range(0, len(video_ids), 50):
request = youtube.videos().list(
part="snippet,contentDetails,statistics",
id=','.join(video_ids[i:i + 50])
)
response = request.execute()
for video in response['items']:
stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
'contentDetails': ['duration', 'definition', 'caption']
}
video_info = {}
video_info['video_id'] = video['id']
for k in stats_to_keep.keys():
for v in stats_to_keep[k]:
try:
video_info[v] = video[k][v]
except:
video_info[v] = None
all_video_info.append(video_info)
def get_comments_in_videos(youtube, video_ids):
"""
Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
Params:
youtube: the build object from googleapiclient.discovery
video_ids: list of video IDs
Returns:
Dataframe with video IDs and associated top level comment in text.
"""
all_comments = []
for video_id in video_ids:
try:
request = youtube.commentThreads().list(
part="snippet,replies",
videoId=video_id
)
response = request.execute()
comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in
response['items'][0:10]]
comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}
all_comments.append(comments_in_video_info)
except:
# When error occurs - most likely because comments are disabled on a video
print('Could not get comments for video ' + video_id)
return pd.DataFrame(all_comments)
channel_data = get_channel_stats(youtube, channel_ids)
# Convert count columns to numeric columns
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')
# Create a dataframe with video statistics and comments from all channels
video_df = pd.DataFrame()
comments_df = pd.DataFrame()
for c in channel_data['channelName'].unique():
print("Getting video information from channel: " + c)
playlist_id = channel_data.loc[channel_data['channelName'] == c, 'playlistId'].iloc[0]
video_ids = get_video_ids(youtube, playlist_id)
# get video data
video_data = get_video_details(youtube, video_ids)
# get comment data
comments_data = get_comments_in_videos(youtube, video_ids)
# append video data together and comment data toghether
video_df = video_df.append(video_data, ignore_index=True)
comments_df = comments_df.append(comments_data, ignore_index=True)
print(video_df)
Non-empty dataframe for video_df and data export to excel
答案1
得分: 1
你在函数 get_video_details()
中缺少了一个返回语句。
我猜你想在函数末尾添加 return pd.DataFrame(all_video_info)
。
英文:
You miss a return statement in your function get_video_details()
.
I guess you want to add return pd.DataFrame(all_video_info)
to the end of the function.
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论