英文:
How to get total number of posts of a subreddit using Python?
问题
我正在进行一个项目,需要使用PRAW来爬取Subreddit的内容。但我需要设置一个限制,以便只爬取指定数量的帖子。例如,如果我想爬取Subreddit gaming(https://www.reddit.com/r/gaming/),我需要设置限制为100,这样它将只爬取前100个帖子。但与此不同的是,我想先获取gaming Subreddit中的总帖子数,然后将该值设置为提取所有帖子的限制。我在互联网上搜索了Pushshift API,但不知道如何操作。任何帮助都将不胜感激!
以下是代码部分:
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from psaw import PushshiftAPI
load_dotenv(find_dotenv())
# 创建一个数据框
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
# Subreddit实例,用于Web爬取
reddit_read_only = praw.Reddit(client_id=os.environ.get("client_id"),
client_secret=os.environ.get("client_secret"),
user_agent=os.environ.get("user_agent"))
def main(name, value):
i = 0
subreddit = reddit_read_only.subreddit(name)
print(subreddit.created)
while i < value:
# 限制爬取的帖子数量
for submission in subreddit.hot(limit=value):
submission.comments.replace_more(limit=(value*30))
lst = []
# 如果有评论,将保存在数据框中
if submission.num_comments != 0:
for comment in submission.comments.list():
lst.append(comment.body)
df.loc[i] = [submission.title, submission.num_comments, lst]
# 如果帖子中没有评论,将保存"没有评论"
elif submission.num_comments == 0:
df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
i += 1
name = 'Reddit_web_scrap_' + str(name) # 以特定名称保存文件
return name
if __name__ == "__main__":
print('#####################################################################')
print('############### Reddit Web Scrapping Started ########################')
print('#####################################################################')
print()
name = main('gaming', 50)
print()
print('Created {}.csv file!'.format(name))
print()
print('#####################################################################')
print('################# Reddit Web Scrapping Ended ########################')
print('#####################################################################')
我已经将限制设置为50,这将爬取前50个帖子。但我想爬取gaming中的所有帖子。如果我将限制设置为"None",则会抛出错误:
TypeError: '<' not supported between instances of 'int' and 'str'
这是合理的,我猜我不能使用限制为"None"。
英文:
I am working on a project where I have to scrap subreddit using PRAW. But I have to put limit so that it will scrap only that many posts. For example, if I want to scrap a subreddit gaming (https://www.reddit.com/r/gaming/) I have to give limit 100 so it scrap for first 100 posts. But instead, I want first the total number of posts in gaming subreddit and then that value I can set as a limit to extract all the posts. I have searched on internet about Pushshift API, but don't know how to do that. Any help is appreciated!
Following code:
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from psaw import PushshiftAPI
load_dotenv(find_dotenv())
#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
client_secret = os.environ.get("client_secret"),
user_agent = os.environ.get("user_agent"))
def main(name, value):
i = 0
subreddit = reddit_read_only.subreddit(name)
print(subreddit.created)
while i < value:
#Limits the scrapping for value number of posts
for submission in subreddit.hot(limit=value):
submission.comments.replace_more(limit=(value*30))
lst = []
#If there are any comments, then it will be saved in dataframe
if submission.num_comments != 0:
for comment in submission.comments.list():
lst.append(comment.body)
df.loc[i] = [submission.title, submission.num_comments, lst]
#If there are no comments in a post, then No comments will be stored
elif submission.num_comments == 0:
df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
i += 1
# print(df)
name = 'Reddit_web_scrap_'+str(name) #save the file with certain name
# df.to_csv(name + str('.csv'), index=False)
return name
if __name__ == "__main__":
print('#####################################################################')
print('############### Reddit Web Scrapping Started ########################')
print('#####################################################################')
print()
name = main('gaming', 50)
print()
print('Created {}.csv file!'.format(name))
print()
print('#####################################################################')
print('################# Reddit Web Scrapping Ended ########################')
print('#####################################################################')
I have put limit to 50 which will scrap first 50 posts. But I want to scrap all the posts that is available in gaming. If I put limit = "None", then it will throw me an error:
TypeError: '<' not supported between instances of 'int' and 'str'
And this is logical as well. So, I guess I won't be able to use limit = "None".
答案1
得分: 1
我已创建一个名为total_posts()的函数,使用Pushshift API,它将为特定子论坛提供可用的总帖子数量。
#导入依赖项
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from pmaw import PushshiftAPI
load_dotenv(find_dotenv())
#创建一个数据帧
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
#要爬取的子论坛实例
reddit_read_only = praw.Reddit(client_id=os.environ.get("client_id"),
client_secret=os.environ.get("client_secret"),
user_agent=os.environ.get("user_agent"))
def total_posts(name):
print("正在计算总帖子数")
print()
api = PushshiftAPI()
api_request_generator = api.search_submissions(subreddit='ChatGPT', score=">=0")
aita_submissions = pd.DataFrame([submission for submission in api_request_generator])
print("子论坛 {} 中的总帖子数为 {}".format(name, aita_submissions.shape[0]))
return aita_submissions.shape[0]
def main(name, value):
print('创建数据帧')
print()
i = 0
subreddit = reddit_read_only.subreddit(name)
while i < value:
#限制爬取的帖子数
for submission in subreddit.hot(limit=value):
submission.comments.replace_more(limit=(value*30))
lst = []
#如果有评论,将其保存在数据帧中
if submission.num_comments != 0:
for comment in submission.comments.list():
lst.append(comment.body)
df.loc[i] = [submission.title, submission.num_comments, lst]
#如果帖子中没有评论,则将存储“无评论”
elif submission.num_comments == 0:
df.loc[i] = [submission.title, submission.num_comments, ['无评论']]
i += 1
print(df)
name = 'Reddit_web_scrap_'+str(name) #以特定名称保存文件
df.to_csv(name + str('.csv'), index=False)
if __name__ == "__main__":
subreddit_name = 'gaming'
print('#####################################################################')
print('#### 开始 Reddit 网页抓取 for {}'.format(subreddit_name) + '####')
print('#####################################################################')
print()
posts_number = total_posts(subreddit_name)
print()
main(subreddit_name, posts_number)
print()
print('创建 {}.csv 文件!'.format(subreddit_name))
print()
print('#####################################################################')
print('################# Reddit 网页抓取结束 ########################')
print('#####################################################################')
英文:
I have created a function total_posts() with the help of Pushshift API, that will give me total number of posts avaialble for a particular subreddit.
#Importing Dependencies
import praw
import pandas as pd
import os
from dotenv import load_dotenv, find_dotenv
from pmaw import PushshiftAPI
load_dotenv(find_dotenv())
#Creating a dataframe
df = pd.DataFrame(columns=['Title', 'Number of comments', 'Comments'])
#Instance of subreddit to be web scraped
reddit_read_only = praw.Reddit(client_id = os.environ.get("client_id"),
client_secret = os.environ.get("client_secret"),
user_agent = os.environ.get("user_agent"))
def total_posts(name):
print("Calculating total number of posts")
print()
api = PushshiftAPI()
api_request_generator = api.search_submissions(subreddit='ChatGPT', score = ">=0")
aita_submissions = pd.DataFrame([submission for submission in api_request_generator])
print("Total number of posts in subreddit {} are {}".format(name, aita_submissions.shape[0]))
return aita_submissions.shape[0]
def main(name, value):
print('Creating dataframe')
print()
i = 0
subreddit = reddit_read_only.subreddit(name)
while i < value:
#Limits the scrapping for value number of posts
for submission in subreddit.hot(limit=value):
submission.comments.replace_more(limit=(value*30))
lst = []
#If there are any comments, then it will be saved in dataframe
if submission.num_comments != 0:
for comment in submission.comments.list():
lst.append(comment.body)
df.loc[i] = [submission.title, submission.num_comments, lst]
#If there are no comments in a post, then No comments will be stored
elif submission.num_comments == 0:
df.loc[i] = [submission.title, submission.num_comments, ['No comments']]
i += 1
print(df)
name = 'Reddit_web_scrap_'+str(name) #save the file with certain name
df.to_csv(name + str('.csv'), index=False)
if __name__ == "__main__":
subreddit_name = 'gaming'
print('#####################################################################')
print('#### Reddit Web Scrapping Started for {}'.format(subreddit_name) + '####')
print('#####################################################################')
print()
posts_number = total_posts(subreddit_name)
print()
main(subreddit_name, posts_number)
print()
print('Created {}.csv file!'.format(subreddit_name))
print()
print('#####################################################################')
print('################# Reddit Web Scrapping Ended ########################')
print('#####################################################################')
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论