Dataframe is Being Returned as Empty

huangapple go评论67阅读模式
英文:

Dataframe is Being Returned as Empty

问题

我是中文翻译,以下是你要翻译的代码部分:

import pandas as pd
import numpy as np

# Google API
from googleapiclient.discovery import build

api_key = '<api key>'
channel_ids = ['<channel_id>', ]

youtube = build('youtube', 'v3', developerKey=api_key)


def get_channel_stats(youtube, channel_ids):
    """
    获取频道统计信息:标题,订阅者数量,观看次数,视频数量,上传播放列表
    参数:

    youtube:来自googleapiclient.discovery的构建对象
    channels_ids:频道ID列表

    返回:
    包含提供列表中所有频道的频道统计信息的数据框:标题,订阅者数量,观看次数,视频数量,上传播放列表

    """

    all_data = []
    request = youtube.channels().list(
        part='snippet,contentDetails,statistics',
        id=','.join(channel_ids))
    response = request.execute()

    for i in range(len(response['items'])):
        data = dict(channelName=response['items'][i]['snippet']['title'],
                    subscribers=response['items'][i]['statistics']['subscriberCount'],
                    views=response['items'][i]['statistics']['viewCount'],
                    totalVideos=response['items'][i]['statistics']['videoCount'],
                    playlistId=response['items'][i]['contentDetails']['relatedPlaylists']['uploads'])
        all_data.append(data)

    return pd.DataFrame(all_data)


def get_video_ids(youtube, playlist_id):
    """
    获取给定播放列表中所有视频的视频ID列表
    参数:

    youtube:来自googleapiclient.discovery的构建对象
    playlist_id:频道的播放列表ID

    返回:
    播放列表中所有视频的视频ID列表

    """

    request = youtube.playlistItems().list(
        part='contentDetails',
        playlistId=playlist_id,
        maxResults=50)
    response = request.execute()

    video_ids = []

    for i in range(len(response['items'])):
        video_ids.append(response['items'][i]['contentDetails']['videoId'])

    next_page_token = response.get('nextPageToken')
    more_pages = True

    while more_pages:
        if next_page_token is None:
            more_pages = False
        else:
            request = youtube.playlistItems().list(
                part='contentDetails',
                playlistId=playlist_id,
                maxResults=50,
                pageToken=next_page_token)
            response = request.execute()

            for i in range(len(response['items'])):
                video_ids.append(response['items'][i]['contentDetails']['videoId'])

            next_page_token = response.get('nextPageToken')

    return video_ids


def get_video_details(youtube, video_ids):
    """
    获取具有给定ID的所有视频的视频统计信息
    参数:

    youtube:来自googleapiclient.discovery的构建对象
    video_ids:视频ID列表

    返回:
    具有视频统计信息的数据帧,即:
    'channelTitle','title','description','tags','publishedAt'
    'viewCount','likeCount','favoriteCount','commentCount'
    'duration','definition','caption'

    """

    all_video_info = []

    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[i:i + 50])
        )
        response = request.execute()

        for video in response['items']:
            stats_to_keep = {'snippet': ['channelTitle', 'title', 'description', 'tags', 'publishedAt'],
                             'statistics': ['viewCount', 'likeCount', 'favouriteCount', 'commentCount'],
                             'contentDetails': ['duration', 'definition', 'caption']
                             }
            video_info = {}
            video_info['video_id'] = video['id']

            for k in stats_to_keep.keys():
                for v in stats_to_keep[k]:
                    try:
                        video_info[v] = video[k][v]
                    except:
                        video_info[v] = None

            all_video_info.append(video_info)


def get_comments_in_videos(youtube, video_ids):
    """
    获取所有具有给定ID的视频的文本格式的顶级评论(仅由于Youtube API的引用限制,获取前10条评论)
    参数:

    youtube:来自googleapiclient.discovery的构建对象
    video_ids:视频ID列表

    返回:
    具有视频ID和相关的顶级评论文本的数据帧。

    """

    all_comments = []

    for video_id in video_ids:
        try:
            request = youtube.commentThreads().list(
                part="snippet,replies",
                videoId=video_id
            )
            response = request.execute()

            comments_in_video = [comment['snippet']['topLevelComment']['snippet']['textOriginal'] for comment in
                                 response['items'][0:10]]
            comments_in_video_info = {'video_id': video_id, 'comments': comments_in_video}

            all_comments.append(comments_in_video_info)

        except:
            # 当出现错误时 - 最有可能是因为在视频上禁用了评论
            print('无法获取视频的评论:' + video_id)

    return pd.DataFrame(all_comments)

channel_data = get_channel_stats(youtube, channel_ids)

# 将计数列转换为数值列
numeric_cols = ['subscribers', 'views', 'totalVideos']
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# 创建包含所有频道的视频统计和评论数据的数据帧

video_df = pd.DataFrame()
comments_df = pd.DataFrame()

for c in channel_data['channelName'].unique():
    print("从频道获取视频信息:" + c)
    playlist_id = channel_data.loc[channel_data['channelName'] == c, 'playlistId'].iloc[0]
    video_ids = get_video_ids(youtube, playlist_id)

    # 获取视频数据
    video_data = get_video_details(youtube, video_ids)
    # 获取评论数据
    comments_data = get_comments_in_videos(youtube, video_ids)

    # 将视频数据和评论数据合并在一起
    video_df = video_df.append(video_data, ignore_index=True)
    comments_df = comments_df.append(comments_data, ignore_index=True)

print(video_df)

非空的video_df数据帧和数据导出到Excel。

英文:

I am new to Python and am trying to learn it to scrape data off of YouTube. I got this program from a very good YouTube tutorial video; it isn't showing any errors but is returning an empty dataframe for video_df. Looking for advice as to why this is happening and what I can do to correct this?


import pandas as pd
import numpy as np
# Google API
from googleapiclient.discovery import build
api_key = &#39;&lt;api key&gt;&#39;
channel_ids = [&#39;&lt;channel_id&gt;&#39;,
]
youtube = build(&#39;youtube&#39;, &#39;v3&#39;, developerKey=api_key)
def get_channel_stats(youtube, channel_ids):
&quot;&quot;&quot;
Get channel statistics: title, subscriber count, view count, video count, upload playlist
Params:
youtube: the build object from googleapiclient.discovery
channels_ids: list of channel IDs
Returns:
Dataframe containing the channel statistics for all channels in the provided list: title, subscriber count, view count, video count, upload playlist
&quot;&quot;&quot;
all_data = []
request = youtube.channels().list(
part=&#39;snippet,contentDetails,statistics&#39;,
id=&#39;,&#39;.join(channel_ids))
response = request.execute()
for i in range(len(response[&#39;items&#39;])):
data = dict(channelName=response[&#39;items&#39;][i][&#39;snippet&#39;][&#39;title&#39;],
subscribers=response[&#39;items&#39;][i][&#39;statistics&#39;][&#39;subscriberCount&#39;],
views=response[&#39;items&#39;][i][&#39;statistics&#39;][&#39;viewCount&#39;],
totalVideos=response[&#39;items&#39;][i][&#39;statistics&#39;][&#39;videoCount&#39;],
playlistId=response[&#39;items&#39;][i][&#39;contentDetails&#39;][&#39;relatedPlaylists&#39;][&#39;uploads&#39;])
all_data.append(data)
return pd.DataFrame(all_data)
def get_video_ids(youtube, playlist_id):
&quot;&quot;&quot;
Get list of video IDs of all videos in the given playlist
Params:
youtube: the build object from googleapiclient.discovery
playlist_id: playlist ID of the channel
Returns:
List of video IDs of all videos in the playlist
&quot;&quot;&quot;
request = youtube.playlistItems().list(
part=&#39;contentDetails&#39;,
playlistId=playlist_id,
maxResults=50)
response = request.execute()
video_ids = []
for i in range(len(response[&#39;items&#39;])):
video_ids.append(response[&#39;items&#39;][i][&#39;contentDetails&#39;][&#39;videoId&#39;])
next_page_token = response.get(&#39;nextPageToken&#39;)
more_pages = True
while more_pages:
if next_page_token is None:
more_pages = False
else:
request = youtube.playlistItems().list(
part=&#39;contentDetails&#39;,
playlistId=playlist_id,
maxResults=50,
pageToken=next_page_token)
response = request.execute()
for i in range(len(response[&#39;items&#39;])):
video_ids.append(response[&#39;items&#39;][i][&#39;contentDetails&#39;][&#39;videoId&#39;])
next_page_token = response.get(&#39;nextPageToken&#39;)
return video_ids
def get_video_details(youtube, video_ids):
&quot;&quot;&quot;
Get video statistics of all videos with given IDs
Params:
youtube: the build object from googleapiclient.discovery
video_ids: list of video IDs
Returns:
Dataframe with statistics of videos, i.e.:
&#39;channelTitle&#39;, &#39;title&#39;, &#39;description&#39;, &#39;tags&#39;, &#39;publishedAt&#39;
&#39;viewCount&#39;, &#39;likeCount&#39;, &#39;favoriteCount&#39;, &#39;commentCount&#39;
&#39;duration&#39;, &#39;definition&#39;, &#39;caption&#39;
&quot;&quot;&quot;
all_video_info = []
for i in range(0, len(video_ids), 50):
request = youtube.videos().list(
part=&quot;snippet,contentDetails,statistics&quot;,
id=&#39;,&#39;.join(video_ids[i:i + 50])
)
response = request.execute()
for video in response[&#39;items&#39;]:
stats_to_keep = {&#39;snippet&#39;: [&#39;channelTitle&#39;, &#39;title&#39;, &#39;description&#39;, &#39;tags&#39;, &#39;publishedAt&#39;],
&#39;statistics&#39;: [&#39;viewCount&#39;, &#39;likeCount&#39;, &#39;favouriteCount&#39;, &#39;commentCount&#39;],
&#39;contentDetails&#39;: [&#39;duration&#39;, &#39;definition&#39;, &#39;caption&#39;]
}
video_info = {}
video_info[&#39;video_id&#39;] = video[&#39;id&#39;]
for k in stats_to_keep.keys():
for v in stats_to_keep[k]:
try:
video_info[v] = video[k][v]
except:
video_info[v] = None
all_video_info.append(video_info)
def get_comments_in_videos(youtube, video_ids):
&quot;&quot;&quot;
Get top level comments as text from all videos with given IDs (only the first 10 comments due to quote limit of Youtube API)
Params:
youtube: the build object from googleapiclient.discovery
video_ids: list of video IDs
Returns:
Dataframe with video IDs and associated top level comment in text.
&quot;&quot;&quot;
all_comments = []
for video_id in video_ids:
try:
request = youtube.commentThreads().list(
part=&quot;snippet,replies&quot;,
videoId=video_id
)
response = request.execute()
comments_in_video = [comment[&#39;snippet&#39;][&#39;topLevelComment&#39;][&#39;snippet&#39;][&#39;textOriginal&#39;] for comment in
response[&#39;items&#39;][0:10]]
comments_in_video_info = {&#39;video_id&#39;: video_id, &#39;comments&#39;: comments_in_video}
all_comments.append(comments_in_video_info)
except:
# When error occurs - most likely because comments are disabled on a video
print(&#39;Could not get comments for video &#39; + video_id)
return pd.DataFrame(all_comments)
channel_data = get_channel_stats(youtube, channel_ids)
# Convert count columns to numeric columns
numeric_cols = [&#39;subscribers&#39;, &#39;views&#39;, &#39;totalVideos&#39;]
channel_data[numeric_cols] = channel_data[numeric_cols].apply(pd.to_numeric, errors=&#39;coerce&#39;)
# Create a dataframe with video statistics and comments from all channels
video_df = pd.DataFrame()
comments_df = pd.DataFrame()
for c in channel_data[&#39;channelName&#39;].unique():
print(&quot;Getting video information from channel: &quot; + c)
playlist_id = channel_data.loc[channel_data[&#39;channelName&#39;] == c, &#39;playlistId&#39;].iloc[0]
video_ids = get_video_ids(youtube, playlist_id)
# get video data
video_data = get_video_details(youtube, video_ids)
# get comment data
comments_data = get_comments_in_videos(youtube, video_ids)
# append video data together and comment data toghether
video_df = video_df.append(video_data, ignore_index=True)
comments_df = comments_df.append(comments_data, ignore_index=True)
print(video_df)

Non-empty dataframe for video_df and data export to excel

答案1

得分: 1

你在函数 get_video_details() 中缺少了一个返回语句。

我猜你想在函数末尾添加 return pd.DataFrame(all_video_info)

英文:

You miss a return statement in your function get_video_details().

I guess you want to add return pd.DataFrame(all_video_info) to the end of the function.

huangapple
  • 本文由 发表于 2023年3月7日 20:26:25
  • 转载请务必保留本文链接:https://go.coder-hub.com/75661965.html
匿名

发表评论

匿名网友

:?: :razz: :sad: :evil: :!: :smile: :oops: :grin: :eek: :shock: :???: :cool: :lol: :mad: :twisted: :roll: :wink: :idea: :arrow: :neutral: :cry: :mrgreen:

确定