This week I tried using Python to collect all the VR-related news on TechCrunch. There are so many articles on TechCrunch that I don’t want to click through them one by one. So I use Python to open the listing pages one at a time, find the target articles, go inside each one, and paste all of its images into an HTML file.

Here’s my code:

import time
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob

def downloadImgs(url):
    """Print every ``<img>`` tag found in the article body at *url*.

    The page is fetched with ``requests`` and parsed with BeautifulSoup.
    Each ``<img>`` tag inside the first element matching the
    ``.article-entry`` CSS selector is written to stdout followed by
    ``<br>``. Nothing is printed when the page has no such element.
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')

    # Guard against pages without an article body instead of letting
    # ``[0]`` raise IndexError and abort the whole crawl.
    entries = soup.select('.article-entry')
    if not entries:
        return

    for img in entries[0].select('img'):
        # Single-argument print() produces the same output on Python 2 and 3.
        print("%s <br>" % img)

def search_all_content(url):
    """Print an HTML ``<div>`` for every VR headline on the page at *url*.

    The listing page is fetched and parsed, and every link matching the
    ``.post-title a`` CSS selector is examined. When the link text contains
    the word "vr", the link tag is printed inside a ``<div>``, followed by
    the images of the linked article (via ``downloadImgs``).
    """
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    total_titles = soup.select('.post-title a')

    for title in total_titles:
        # TextBlob tokenizes the headline so 'vr' is matched as a whole
        # word rather than as a substring of another word.
        blob = TextBlob(title.text)
        if blob.words.count('vr') > 0:
            # Single-argument print() produces the same output on Python 2 and 3.
            print("<div> %s <br>" % title)
            # Follow the headline link and emit the article's images
            # inside the same <div>; skip anchors that carry no href.
            link = title.get('href')
            if link:
                downloadImgs(link)
            print("</div>")

# Listing pages are requested as base_url + <page number>.
# NOTE(review): the original URL literal was empty; this is presumably the
# TechCrunch paginated index -- confirm before running.
base_url = 'https://techcrunch.com/page/'

# Emit the HTML header of the report on stdout.
# NOTE(review): the <style> wrapper and the CSS selectors were missing from
# the original template; the selectors below (div / a / a:hover) are a
# reconstruction around the surviving declarations -- confirm.
print('''<!DOCTYPE html><html><head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<style>
    div {
        padding: 30px;
        position: relative;
    }
    a {
        font-size: 25px;
        font-weight: bold;
        color: black;
        text-decoration: none;
    }
    a:hover {
        padding-bottom: 5px;
        text-decoration: underline;
    }
</style>
</head><body>''')

# Crawl listing pages 1 through 199 and print the matching headlines.
for pagenumber in range(1, 200):
    search_all_content(base_url + str(pagenumber))

# Close the document opened by the header above.
print('</body></html>')

I save all the news titles and images to an HTML file.

Click here to view

Leave a Reply

Your email address will not be published. Required fields are marked *