Md2 #1
Open
wants to merge 39 commits into main from md2

Commits (39)
d56466c
initial commit of scrapy project
shreya025 Dec 3, 2020
edeef88
Add files via upload
angela81ku Dec 4, 2020
99c55a6
added web scraping data-cleaning EDA files
sourav-naskar Dec 12, 2020
3ec4cdd
added web scraping files
sourav-naskar Dec 12, 2020
70ea705
added Web scraping data cleaning EDA files
sourav-naskar Dec 12, 2020
a74f34d
Add the webscraping and EDA tutorials
elateifsara Dec 12, 2020
9daa6a2
Pull the changes
elateifsara Dec 12, 2020
253db76
Clean up the folder and restructure
elateifsara Dec 12, 2020
2d94b47
Module2
YasaminAbbaszadegan Dec 14, 2020
799dd0f
updated webscraping data cleaning EDA files
sourav-naskar Dec 19, 2020
6821c53
updated files
sourav-naskar Dec 19, 2020
2fb5b70
updated files
sourav-naskar Dec 19, 2020
efd1c36
updated files
sourav-naskar Dec 19, 2020
0ebda3a
updated files
sourav-naskar Dec 19, 2020
79abadf
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
YasaminAbbaszadegan Dec 19, 2020
5addf75
Add text file
elateifsara Dec 20, 2020
621a626
Add Yasamin notebook to our repo
elateifsara Dec 20, 2020
7b46144
Add Flowster Forum Scraping Example notebook
elateifsara Dec 21, 2020
1724a71
Pull latest changes
elateifsara Dec 21, 2020
7d17c49
Remove sara_elateif folder (served as demonstration example)
elateifsara Dec 21, 2020
dcacd44
initial commit
Sachitt Jan 2, 2021
d0f918f
updated files
sourav-naskar Jan 2, 2021
56474e9
module two - forum spider and data cleaning
shreya025 Jan 2, 2021
eca50f1
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
shreya025 Jan 2, 2021
5dff8ce
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
Sachitt Jan 2, 2021
2a7b9ef
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
YasaminAbbaszadegan Jan 5, 2021
7aa8c37
Final_WebScraping_Version
YasaminAbbaszadegan Jan 5, 2021
ca52f8a
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
Sachitt Jan 11, 2021
4dce683
Restructuring file system
Sachitt Jan 11, 2021
56753d5
restructuring file system
Sachitt Jan 11, 2021
6e8b2d7
Added information acquired by scrolling
Sachitt Jan 11, 2021
825f033
More EDA
Sachitt Jan 11, 2021
5fbda64
Final version of Webscraping DataCleaning EDA files
sourav-naskar Jan 12, 2021
508729a
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
Sachitt Jan 13, 2021
e7f2681
renaming files and adding cleaneddata.csv to be used in md3
Sachitt Jan 13, 2021
f77f4c3
Added a csv file with stopwords to be used in md3
Sachitt Jan 16, 2021
9e46560
adjusted to clean amazon data
Sachitt Jan 30, 2021
1eda659
Add assets folder
elateifsara Jun 15, 2021
6bded57
Add new stuff from md2
elateifsara Jun 15, 2021
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
125,496 changes: 125,496 additions & 0 deletions Sourav_Naskar/Codeacademy_Webscrapper_20210107154307_final.csv

Large diffs are not rendered by default.

1,394 changes: 1,394 additions & 0 deletions Sourav_Naskar/Data_cleaning & EDA_Codeacademy_final.ipynb

Large diffs are not rendered by default.

214 changes: 214 additions & 0 deletions Sourav_Naskar/Webscraping_Codeacademy_final.py
@@ -0,0 +1,214 @@
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from datetime import datetime
import os
import pandas as pd


class CodeacademyWebscraper:
    def __init__(self, webdriverPath):
        # Dictionary of all topics and their attributes
        self.topicDict = {}

        # Pandas dataframe of all topic attributes
        self.topicDataframe = pd.DataFrame(columns=[
            'Topic Title',
            'Category',
            'Tags',
            'Leading Comment',
            'Other Comments',
            'Likes',
            'Views'])

        # Set up the webdriver
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')  # Ignore security certificates
        options.add_argument('--incognito')                  # Use Chrome in Incognito mode
        options.add_argument('--headless')                   # Run in the background
        self.driver = webdriver.Chrome(
            executable_path=webdriverPath,
            options=options)

    def get_title(self, topicSoup):
        topicName = topicSoup.find('a', class_='fancy-title').text

        # Remove leading and trailing spaces and newlines
        topicName = topicName.replace('\n', '').strip()
        return topicName

    def get_category_and_tags(self, topicSoup):
        topicCategoryDiv = topicSoup.find('div', class_='topic-category ember-view')
        tagAnchors = topicCategoryDiv.find_all('span', class_='category-name')

        tagList = [anchor.text for anchor in tagAnchors]

        # The first entry is the category; any remaining entries are tags
        category = tagList[0]
        tags = tagList[1:]
        return category, tags

    def get_comments(self, topicSoup):
        # Get the text of every post on the topic page
        postDivs = topicSoup.find_all('div', class_='cooked')
        comments = [div.get_text() for div in postDivs]
        try:
            leading_comment = comments[0]
            other_comments = comments[1:]
        except IndexError:
            leading_comment, other_comments = '', []

        return leading_comment, other_comments

    def get_views(self, topicSoup):
        views = topicSoup.find('li', class_='secondary views')
        if views is None:
            return str(0)
        return views.span.text

    def get_likes(self, topicSoup):
        likes = topicSoup.find('li', class_='secondary likes')
        if likes is None:
            return str(0)
        return likes.span.text

    def runApplication(self, baseURL):
        # Open a Chrome web client using Selenium and retrieve the page source
        self.driver.get(baseURL)

        # Get the links of all categories
        categ_links = self.driver.find_elements_by_css_selector('.category > h3 > a')
        categ_urls = []
        for link in categ_links:
            categ_urls.append(link.get_attribute('href'))

        # Go over each category URL
        for categ_url in categ_urls:
            # Access the category webpage
            self.driver.get(categ_url)

            # Load the entire webpage by scrolling to the bottom
            lastHeight = self.driver.execute_script("return document.body.scrollHeight")
            while True:
                # Scroll to the bottom of the page
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

                # Wait for the new page segment to load
                time.sleep(0.5)

                # Calculate the new scroll height and compare it with the last scroll height
                newHeight = self.driver.execute_script("return document.body.scrollHeight")
                if newHeight == lastHeight:
                    break
                lastHeight = newHeight

            # Generate the category soup object
            categoryHTML = self.driver.page_source
            categorySoup = BeautifulSoup(categoryHTML, 'html.parser')

            # Find all anchor objects that contain topic information
            topicAnchors = categorySoup.find_all('a', class_='title raw-link raw-topic-link')

            # Get the hyperlink references and append them to the base URL to form the topic page URLs
            topicPageURLs = []
            for anchor in topicAnchors:
                topicPageURLs.append(baseURL + anchor['href'])

            # Inner loop over all topics in the category
            for topicURL in topicPageURLs:
                # Get the topic HTML text and generate the topic soup object
                self.driver.get(topicURL)
                topicHTML = self.driver.page_source
                topicSoup = BeautifulSoup(topicHTML, 'html.parser')

                # Scrape all topic attributes of interest
                topicTitle = self.get_title(topicSoup)
                category, tags = self.get_category_and_tags(topicSoup)
                leadingComment, otherComments = self.get_comments(topicSoup)
                numLikes = self.get_likes(topicSoup)
                numViews = self.get_views(topicSoup)

                # Create the attribute dictionary for the topic
                attributeDict = {
                    'Topic Title': topicTitle,
                    'Category': category,
                    'Tags': tags,
                    'Leading Comment': leadingComment,
                    'Other Comments': otherComments,
                    'Likes': numLikes,
                    'Views': numViews}

                # Add the new entry to the topic dictionary and the Pandas dataframe
                # (DataFrame.append was removed in pandas 2.0, so use pd.concat instead)
                self.topicDict[topicTitle] = attributeDict
                self.topicDataframe = pd.concat(
                    [self.topicDataframe, pd.DataFrame([attributeDict])],
                    ignore_index=True)

                print('Topic Title:')
                print(topicTitle)
                print('Category:')
                print(category)
                print('Tags:')
                print(tags)
                print('Leading Comment:')
                print(leadingComment)
                print('Other Comments:')
                print(otherComments)
                print('Likes:')
                print(numLikes)
                print('Views:')
                print(numViews)

        # Get a unique timestamp for this webscraping run
        timeStamp = datetime.now().strftime('%Y%m%d%H%M%S')

        # Save the data in a CSV file stored in the same folder as this program
        csvFilename = 'Codeacademy_Webscrapper_' + timeStamp + '.csv'
        csvFileFullPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), csvFilename)
        self.topicDataframe.to_csv(csvFileFullPath)


if __name__ == '__main__':
    # Local path to the Chrome webdriver (raw string so the backslashes are not escapes)
    webdriverPath = r'C:\Program Files (x86)\chromedriver.exe'

    # Codeacademy forum base URL
    baseURL = 'https://discuss.codecademy.com/'

    # Create the Codeacademy forum webscraping object
    codeacademyWebscraper = CodeacademyWebscraper(webdriverPath)

    # Run the webscraper and save the data
    codeacademyWebscraper.runApplication(baseURL)
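
Note on the output: the Tags and Other Comments columns hold Python lists, which to_csv writes out as their string representations. Below is a minimal sketch of loading the scraped CSV back for the md3 cleaning step, assuming the CSV committed in this PR has the columns the scraper writes:

import ast
import pandas as pd

# Filename matches the CSV committed in this PR; any fresh run of the
# scraper produces the same pattern with a different timestamp
df = pd.read_csv('Sourav_Naskar/Codeacademy_Webscrapper_20210107154307_final.csv',
                 index_col=0)

# List-valued columns come back as strings such as "['python', 'loops']";
# ast.literal_eval turns them back into real lists
for col in ['Tags', 'Other Comments']:
    df[col] = df[col].apply(ast.literal_eval)

print(df.head())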
52 changes: 52 additions & 0 deletions YIZHEN_KU/md2.py
@@ -0,0 +1,52 @@
from bs4 import BeautifulSoup
import requests

### scraping ###
url = "https://sellercentral.amazon.com/forums/"
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
t1 = soup.find_all('a')

# Collect the href attribute of every anchor tag (skip anchors without one)
href_list = []
for t2 in t1:
    t3 = t2.get('href')
    if t3 is not None:
        href_list.append(t3)
print(href_list)

# Complete the relative URLs by prepending the site root
def geturl(incompletelist, completelist):
    original = "https://sellercentral.amazon.com"
    for i in range(len(incompletelist)):
        completelist.append(original + incompletelist[i])

href_list_full = []
geturl(href_list, href_list_full)
print(href_list_full)

# The last path segment of each link is the board name
nameofboard = []
for i in range(len(href_list)):
    nameofboard.append(href_list[i].rpartition('/')[-1])

# Dictionary of all subcategories in the Amazon seller forum
dictamz = dict(zip(nameofboard, href_list_full))
# print(dictamz)

url = dictamz[nameofboard[1]]
# url = "https://sellercentral.amazon.com/forums/c/selling-on-amazon"
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')

### Failed attempt 01: the <td> cells in the static HTML are empty
players = [elem.text for elem in soup.find_all('td')]
players_list = soup.find_all('td')
for player in players_list:
    print(player.text)

### Failed attempt 02: no table in the static HTML, so read_html raises an error
import pandas as pd

url = 'https://sellercentral.amazon.com/forums/c/selling-on-amazon'
pd.read_html(url)
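
Both failed attempts have the same cause: the Seller Central forum is a Discourse site that renders its topic tables client-side with JavaScript, so the static HTML fetched by requests contains no populated <td> cells and no <table> for pandas to parse. A hedged sketch of one way around this, assuming the forum exposes the standard Discourse JSON routes (append .json to a category URL):

import requests

# Assumption: a standard Discourse instance, which mirrors each category
# page as JSON at the same URL with a '.json' suffix
url = 'https://sellercentral.amazon.com/forums/c/selling-on-amazon.json'
resp = requests.get(url, headers={'Accept': 'application/json'})
resp.raise_for_status()

# In the Discourse payload the topics live under topic_list -> topics
for topic in resp.json()['topic_list']['topics']:
    print(topic['title'], topic['views'], topic['like_count'])

Alternatively, driving the page with Selenium (as in the Codeacademy scraper above) lets the JavaScript run before the HTML is parsed.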
