# -*- coding: utf-8 -*-
"""Web Crawler.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1Yn5eFLHgQYP8OfPBgT-xnMpltFeXHXac

# Deep Learning
# 2020PCS1009

**Assignment 1:** Web Crawler

**Text - Crawler**
"""
import requests
from bs4 import BeautifulSoup
import csv
import json  # used below for json.dump / json.load
url = ["https://tex.stackexchange.com/?tab=active",
       "https://tex.stackexchange.com/?tab=bounties",
       "https://tex.stackexchange.com/?tab=hot",
       "https://tex.stackexchange.com/?tab=week",
       "https://tex.stackexchange.com/?tab=month"]
Final_Result = []
for link in url:
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'lxml')
    # The tab name after the final '=' (active, bounties, hot, week, month)
    # serves as the category label.
    Category = link.rsplit('=', 1)[-1]
    Question_list = soup.find(id="question-mini-list")
    Questions = Question_list.find_all(class_="question-summary")
    for qt in Questions:
        t = qt.find(class_="question-hyperlink")
        Qsn = t.get_text()
        q_link = t['href']  # relative question URL (collected, not stored below)
        v = qt.find(class_="mini-counts")
        vote = v.get_text()
        s = qt.find(class_="status")
        s1 = s.find(class_="mini-counts")
        num_ans = s1.get_text()
        vi = qt.find(class_="views")
        vi1 = vi.find(class_="mini-counts")
        view = vi1.get_text()
        tag = qt.find(class_="tags")
        post_tag = tag.find(class_="post-tag")
        pt = post_tag.get_text()
        start = qt.find(class_="started")
        rt = start.find(class_="relativetime")
        time = rt.get_text()
        # The author name is buried in the free text of the "started" block,
        # so it is recovered by splitting on newlines and then on spaces.
        Au = start.get_text()
        Au1 = Au.rsplit('\n')[-2]
        Author = Au1.rsplit(' ')[-2]
        rs = start.find(class_="reputation-score")
        R_Score = rs.get_text()
        record = {
            'Question': Qsn,
            'Votes': vote,
            'Number of Answers': num_ans,
            'Views': view,
            'Tags': pt,
            'Category': Category,
            'Time': time,
            'Author Name': Author,
            'Reputation Score': R_Score,
        }
        Final_Result.append(record)
# Let's write these to a JSON file for now.
with open('data.json', 'w') as outfile:
    json.dump(Final_Result, outfile, indent=4)
with open('/content/data.json') as json_file:
    jsondata = json.load(json_file)

data_file = open('/content/data.csv', 'w', newline='')
csv_writer = csv.writer(data_file)
count = 0
for data in jsondata:
    if count == 0:
        # Write the header row once, from the keys of the first record.
        header = data.keys()
        csv_writer.writerow(header)
        count += 1
    csv_writer.writerow(data.values())
data_file.close()
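# Alternative sketch (assumption: every record shares the same keys, which
# holds here because all records are built from the same template dict):
# csv.DictWriter writes the header and all rows in one pass. The output path
# is hypothetical.
if jsondata:
    with open('/content/data_dictwriter.csv', 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=jsondata[0].keys())
        writer.writeheader()
        writer.writerows(jsondata)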
"""**Image Crawler**"""
from bs4 import *
import requests as rq
import os
r = rq.get("https://unsplash.com/s/photos/web")
soup = BeautifulSoup(r.content, 'html.parser')
#print(soup)
link = []
x = soup.find_all("img")
# print(x)
for img in x:
link.append(img['src'])
# for l in link:
# print(l)
os.mkdir('Bharati_Img_Crawler')
i = 1
for index, img_link in enumerate(link):
if i <= 40:
img_data = rq.get(img_link).content
with open("Bharati_Img_Crawler/"+str(index+1)+'.jpg', 'wb+') as f:
f.write(img_data)
i += 1
else:
f.close()
break
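# Optional sketch (assumption): checking the HTTP status before writing avoids
# saving error pages with a .jpg extension. Defined as a helper here; the loop
# above keeps the original behaviour.
def save_image(image_url, path):
    resp = rq.get(image_url)
    if resp.status_code == 200:  # only persist successful responses
        with open(path, 'wb') as out:
            out.write(resp.content)
    return resp.status_code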
import shutil

# Zip the downloaded images, then remove the working directory.
!zip -r ./Bharati_Img_Crawler.zip ./Bharati_Img_Crawler/
shutil.rmtree('Bharati_Img_Crawler')
"""**Video Crawler**"""
from bs4 import *
import requests as rq
import os
r = rq.get("https://sample-videos.com/index.php#sample-mp4-video")
soup = BeautifulSoup(r.content, 'html.parser')
# print(soup)
link = []
temp = []
x = soup.find_all("a")
# print(x)
for a in x:
link.append(a['href'])
# for l in link:
# print(l)
temp = link[26:108]
# print(len(temp))
main_link = "https://sample-videos.com/"
final = []
for l in temp:
final.append(main_link+l)
final
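# Optional sketch (assumption): the hard-coded slice above depends on the exact
# page layout; filtering hrefs by their '.mp4' extension would be more robust
# if the page changes.
mp4_links = [main_link + l for l in link if l.endswith('.mp4')]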
os.mkdir('Bharati_Video_Crawler')

def download_video_series(final):
    # Stream each video to disk in 1 MB chunks to keep memory use low.
    for link in final:
        file_name = link.split('/')[-1]
        r = rq.get(link, stream=True)
        with open("Bharati_Video_Crawler/" + file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)

download_video_series(final)
# Zip the downloaded videos, then remove the working directory.
!zip -r ./Bharati_Video_Crawler.zip ./Bharati_Video_Crawler/
shutil.rmtree('Bharati_Video_Crawler')