How to Get a Reddit Submission's Comments Using the API
Reddit is a goldmine of unstructured human conversation — 100,000+ active communities discussing everything from machine learning to mechanical keyboards. For researchers, analysts, and NLP practitioners, Reddit data powers sentiment analysis, trend detection, market research, and training datasets. In this guide, I'll show you three practical approaches to scraping Reddit in 2026: the built-in JSON API, the PRAW library, and proxy-based scaling for large jobs. All come with working Python code.

Why Scrape Reddit?
- Sentiment analysis: Track public opinion on brands, products, or events
- Market research: Find what people actually say about competitors
- NLP training data: Millions of labeled conversations (upvotes = quality signal)
- Trend detection: Spot emerging topics before they hit mainstream
- Academic research: Social network analysis, community dynamics
- Content aggregation: Build curated feeds from niche subreddits

Method 1: Reddit's JSON API (No Auth Required)

Reddit has a little-known feature: append .json to almost any Reddit URL and you get structured JSON data back.
import requests import time def fetch_subreddit_posts(subreddit, sort="hot", limit=25): url = f"https://www.reddit.com/r/{subreddit}/{sort}.json" params = { "limit": limit, "raw_json": 1, # Prevents HTML encoding in responses } headers = { "User-Agent": "PythonScraper/1.0 (research project)" } response = requests.get(url, params=params, headers=headers) if response.status_code != 200: print(f"Error: HTTP {response.status_code}") return [] data = response.json() posts = [] for child in data["data"]["children"]: post = child["data"] posts.append({ "id": post["id"], "title": post["title"], "author": post["author"], "score": post["score"], "num_comments": post["num_comments"], "created_utc": post["created_utc"], "url": post["url"], "selftext": post["selftext"][:500], # First 500 chars "permalink": f"https://reddit.com{post['permalink']}", }) return posts # Usage posts = fetch_subreddit_posts("machinelearning", sort="top", limit=100) for post in posts[:5]: print(f"[{post['score']}] {post['title']}")

Available Sort Options
- /hot.json — Currently trending
- /new.json — Most recent
- /top.json — Highest scored (add ?t=day|week|month|year|all to choose the time window)
- /rising.json — Gaining momentum
- /controversial.json — Most debated

Pagination with the after Parameter

Reddit returns at most 100 posts per request (25 by default).
To get more, use the after parameter with the last post's fullname (t3_ + id): def scrape_all_posts(subreddit, sort="new", max_posts=500): url = f"https://www.reddit.com/r/{subreddit}/{sort}.json" all_posts = [] after = None while len(all_posts) < max_posts: params = { "limit": 100, "raw_json": 1, } if after: params["after"] = after headers = {"User-Agent": "PythonScraper/1.0 (research)"} response = requests.get(url, params=params, headers=headers) if response.status_code == 429: print("Rate limited — waiting 60s...") time.sleep(60) continue if response.status_code != 200: break data = response.json() children = data["data"]["children"] if not children: break for child in children: post = child["data"] all_posts.append({ "id": post["id"], "title": post["title"], "author": post["author"], "score": post["score"], "num_comments": post["num_comments"], "selftext": post["selftext"], "created_utc": post["created_utc"], "subreddit": post["subreddit"], }) after = data["data"]["after"] if not after: break print(f"Fetched {len(all_posts)} posts so far...") time.sleep(2) # Reddit rate limit: ~30 req/min without auth return all_posts[:max_posts] # Scrape 500 recent posts from r/datascience posts = scrape_all_posts("datascience", sort="new", max_posts=500) print(f"Total: {len(posts)} posts") Scraping Comments Comments are where the real value is.
Here's how to get them: def fetch_post_comments(post_id, subreddit): url = f"https://www.reddit.com/r/{subreddit}/comments/{post_id}.json" headers = {"User-Agent": "PythonScraper/1.0 (research)"} params = {"raw_json": 1, "limit": 500} response = requests.get(url, params=params, headers=headers) if response.status_code != 200: return [] data = response.json() # Reddit returns [post_data, comments_data] comments_data = data[1]["data"]["children"] comments = [] parse_comments(comments_data, comments, depth=0) return comments def parse_comments(children, results, depth): for child in children: if child["kind"] != "t1": # Skip non-comment entries continue comment = child["data"] results.append({ "id": comment["id"], "author": comment["author"], "body": comment["body"], "score": comment["score"], "created_utc": comment["created_utc"], "depth": depth, }) # Recursively parse replies if comment.get("replies") and isinstance(comment["replies"], dict): reply_children = comment["replies"]["data"]["children"] parse_comments(reply_children, results, depth + 1) # Get all comments from a post comments = fetch_post_comments("abc123", "python") for c in comments[:10]: indent = " " * c["depth"] print(f"{indent}[{c['score']}] {c['author']}: {c['body'][:80]}") Method 2: PRAW (Python Reddit API Wrapper) For authenticated access with higher rate limits, use PRAW.
You'll need to create a Reddit app at https://www.reddit.com/prefs/apps/: import praw reddit = praw.Reddit( client_id="YOUR_CLIENT_ID", client_secret="YOUR_CLIENT_SECRET", user_agent="PythonResearch/1.0" ) # Fetch top posts subreddit = reddit.subreddit("artificial") for post in subreddit.top(time_filter="week", limit=50): print(f"[{post.score}] {post.title}") print(f" Comments: {post.num_comments}") print(f" Author: {post.author}") print() # Search across Reddit for post in reddit.subreddit("all").search("web scraping python", limit=25): print(f"r/{post.subreddit}: {post.title}") PRAW vs JSON API Searching Across Reddit The JSON API supports search too: def search_reddit(query, subreddit=None, sort="relevance", time_filter="all", limit=100): if subreddit: url = f"https://www.reddit.com/r/{subreddit}/search.json" else: url = "https://www.reddit.com/search.json" params = { "q": query, "sort": sort, "t": time_filter, "limit": min(limit, 100), "raw_json": 1, "restrict_sr": 1 if subreddit else 0, } headers = {"User-Agent": "PythonScraper/1.0 (research)"} response = requests.get(url, params=params, headers=headers) if response.status_code != 200: return [] results = [] for child in response.json()["data"]["children"]: post = child["data"] results.append({ "title": post["title"], "subreddit": post["subreddit"], "score": post["score"], "url": post["url"], "num_comments": post["num_comments"], }) return results # Search for scraping discussions results = search_reddit("web scraping best practices 2026", sort="top", time_filter="month") Rate Limits and How to Handle Them Reddit's rate limits: - Unauthenticated: ~30 requests per minute - Authenticated (OAuth): 60 requests per minute - With premium: 100 requests per minute Handling Rate Limits Gracefully import time from functools import wraps def rate_limited(max_per_minute=30): min_interval = 60.0 / max_per_minute def decorator(func): last_called = [0.0] @wraps(func) def wrapper(*args, **kwargs): elapsed = time.time() - 
last_called[0] wait = min_interval - elapsed if wait > 0: time.sleep(wait) result = func(*args, **kwargs) last_called[0] = time.time() return result return wrapper return decorator @rate_limited(max_per_minute=25) def safe_fetch(url, params, headers): return requests.get(url, params=params, headers=headers) Scaling Up: Using Proxies For large-scale scraping (thousands of posts across many subreddits), you'll hit rate limits fast.
Two solutions: Proxy Aggregation with ScrapeOps ScrapeOps routes your requests through the cheapest working proxy automatically: def fetch_via_scrapeops(target_url): params = { "api_key": "YOUR_SCRAPEOPS_KEY", "url": target_url, } response = requests.get("https://proxy.scrapeops.io/v1/", params=params) return response.json() Managed Scraping with ScraperAPI ScraperAPI handles proxy rotation, retries, and CAPTCHA solving: def fetch_via_scraperapi(target_url): params = { "api_key": "YOUR_SCRAPERAPI_KEY", "url": target_url, } response = requests.get("https://api.scraperapi.com", params=params) return response.json() Building a Complete Reddit Dataset Here's a full pipeline that scrapes posts and comments, then saves to JSON: import requests import json import time from datetime import datetime def build_reddit_dataset(subreddits, posts_per_sub=100, include_comments=True): dataset = [] for sub in subreddits: print(f"\n--- Scraping r/{sub} ---") posts = scrape_all_posts(sub, sort="top", max_posts=posts_per_sub) for i, post in enumerate(posts): entry = {**post, "subreddit": sub} if include_comments and post["num_comments"] > 0: time.sleep(2) comments = fetch_post_comments(post["id"], sub) entry["comments"] = comments print(f" [{i+1}/{len(posts)}] {post['title'][:50]}...
({len(comments)} comments)") else: entry["comments"] = [] dataset.append(entry) print(f"Completed r/{sub}: {len(posts)} posts") return dataset # Build a dataset from multiple subreddits subreddits = ["datascience", "machinelearning", "python", "webdev"] dataset = build_reddit_dataset( subreddits, posts_per_sub=50, include_comments=True ) # Save with metadata output = { "scraped_at": datetime.utcnow().isoformat(), "subreddits": subreddits, "total_posts": len(dataset), "total_comments": sum(len(p.get("comments", [])) for p in dataset), "posts": dataset, } with open("reddit_dataset.json", "w", encoding="utf-8") as f: json.dump(output, f, indent=2, ensure_ascii=False) print(f"\nDataset saved: {output['total_posts']} posts, {output['total_comments']} comments") Exporting to CSV for Analysis import csv def export_posts_csv(posts, filename="reddit_posts.csv"): fieldnames = ["id", "subreddit", "title", "author", "score", "num_comments", "created_utc", "selftext"] with open(filename, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore") writer.writeheader() writer.writerows(posts) print(f"Exported {len(posts)} posts to {filename}") def export_comments_csv(posts, filename="reddit_comments.csv"): fieldnames = ["post_id", "comment_id", "author", "body", "score", "depth"] with open(filename, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() for post in posts: for comment in post.get("comments", []): writer.writerow({ "post_id": post["id"], "comment_id": comment["id"], "author": comment["author"], "body": comment["body"], "score": comment["score"], "depth": comment["depth"], }) Legal and Ethical Considerations - Reddit's API Terms: Reddit's API terms of service require attribution and prohibit commercial use without agreement.
The JSON API is public but still governed by these terms.
- Rate limiting: Always respect rate limits — getting banned helps nobody
- User privacy: Don't deanonymize users or link Reddit accounts to real identities
- Content policies: Don't scrape private/quarantined subreddits
- Data retention: Consider how long you store scraped data and who has access

Wrapping Up

Reddit scraping in 2026 comes down to three approaches:
- JSON API (.json suffix) — zero setup, great for quick scripts and small datasets
- PRAW — higher rate limits, streaming support, better for production pipelines
- Proxy-based scaling — when you need thousands of posts, use ScrapeOps for proxy aggregation or ScraperAPI for fully managed scraping

The JSON API is where most people should start.
It requires no authentication, returns clean structured data, and handles 90% of use cases. Add PRAW when you need streaming or higher limits, and proxies when you're operating at scale. Happy scraping! Pro tip: For reliable proxy rotation and residential IPs, check out ThorData — they offer competitive rates for web scraping at scale. Top comments (0)
People Also Asked
- How to Scrape Reddit in 2026: Subreddits, Posts, Comments via ...
- Parsing Reddit Comments with PRAW | by Arthur Cavalcanti | Medium
- How To Scrape Reddit Comments - Marco’s Substack
- r/redditdev on Reddit: Getting *all* the comments in a post
- Get all comments from a specific reddit thread in python
- Using an API to Extract All Comments from a Reddit Post
- GitHub - dbeley/reddit-scraper: Various scripts to download ...
- The Complete Reddit API Tutorial - Extract Post Comments Step ...
How to Scrape Reddit in 2026: Subreddits, Posts, Comments via ...?
You'll need to create a Reddit app at https://www.reddit.com/prefs/apps/: import praw reddit = praw.Reddit( client_id="YOUR_CLIENT_ID", client_secret="YOUR_CLIENT_SECRET", user_agent="PythonResearch/1.0" ) # Fetch top posts subreddit = reddit.subreddit("artificial") for post in subreddit.top(time_filter="week", limit=50): print(f"[{post.score}] {post.title}") print(f" Comments: {post.num_comments}...
Parsing Reddit Comments with PRAW | by Arthur Cavalcanti | Medium?
Here's how to get them: def fetch_post_comments(post_id, subreddit): url = f"https://www.reddit.com/r/{subreddit}/comments/{post_id}.json" headers = {"User-Agent": "PythonScraper/1.0 (research)"} params = {"raw_json": 1, "limit": 500} response = requests.get(url, params=params, headers=headers) if response.status_code != 200: return [] data = response.json() # Reddit returns [post_data, comments_d...
How To Scrape Reddit Comments - Marco’s Substack?
Here's how to get them: def fetch_post_comments(post_id, subreddit): url = f"https://www.reddit.com/r/{subreddit}/comments/{post_id}.json" headers = {"User-Agent": "PythonScraper/1.0 (research)"} params = {"raw_json": 1, "limit": 500} response = requests.get(url, params=params, headers=headers) if response.status_code != 200: return [] data = response.json() # Reddit returns [post_data, comments_d...
r/redditdev on Reddit: Getting *all* the comments in a post?
import requests import time def fetch_subreddit_posts(subreddit, sort="hot", limit=25): url = f"https://www.reddit.com/r/{subreddit}/{sort}.json" params = { "limit": limit, "raw_json": 1, # Prevents HTML encoding in responses } headers = { "User-Agent": "PythonScraper/1.0 (research project)" } response = requests.get(url, params=params, headers=headers) if response.status_code != 200: print(f"Erro...
Get all comments from a specific reddit thread in python?
To get more, use the after parameter with the last post's fullname (t3_ + id): def scrape_all_posts(subreddit, sort="new", max_posts=500): url = f"https://www.reddit.com/r/{subreddit}/{sort}.json" all_posts = [] after = None while len(all_posts) < max_posts: params = { "limit": 100, "raw_json": 1, } if after: params["after"] = after headers = {"User-Agent": "PythonScraper/1.0 (research)"} response...