Md2 #1
Open
wants to merge 39 commits into main from md2

Commits (39)
d56466c
initial commit of scrapy project
shreya025 Dec 3, 2020
edeef88
Add files via upload
angela81ku Dec 4, 2020
99c55a6
added web scraping data-cleaning EDA files
sourav-naskar Dec 12, 2020
3ec4cdd
added web scraping files
sourav-naskar Dec 12, 2020
70ea705
added Web scraping data cleaning EDA files
sourav-naskar Dec 12, 2020
a74f34d
Add the webscraping and EDA tutorials
elateifsara Dec 12, 2020
9daa6a2
Pull the changes
elateifsara Dec 12, 2020
253db76
Clean up the folder and restructure
elateifsara Dec 12, 2020
2d94b47
Module2
YasaminAbbaszadegan Dec 14, 2020
799dd0f
updated webscraping data cleaning EDA files
sourav-naskar Dec 19, 2020
6821c53
updated files
sourav-naskar Dec 19, 2020
2fb5b70
updated files
sourav-naskar Dec 19, 2020
efd1c36
updated files
sourav-naskar Dec 19, 2020
0ebda3a
updated files
sourav-naskar Dec 19, 2020
79abadf
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
YasaminAbbaszadegan Dec 19, 2020
5addf75
Add text file
elateifsara Dec 20, 2020
621a626
Add Yasamin notebook to our repo
elateifsara Dec 20, 2020
7b46144
Add Flowster Forum Scraping Example notebook
elateifsara Dec 21, 2020
1724a71
Pull latest changes
elateifsara Dec 21, 2020
7d17c49
Remove sara_elateif folder (served as demonstration example)
elateifsara Dec 21, 2020
dcacd44
initial commit
Sachitt Jan 2, 2021
d0f918f
updated files
sourav-naskar Jan 2, 2021
56474e9
module two - forum spider and data cleaning
shreya025 Jan 2, 2021
eca50f1
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
shreya025 Jan 2, 2021
5dff8ce
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
Sachitt Jan 2, 2021
2a7b9ef
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
YasaminAbbaszadegan Jan 5, 2021
7aa8c37
Final_WebScraping_Version
YasaminAbbaszadegan Jan 5, 2021
ca52f8a
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
Sachitt Jan 11, 2021
4dce683
Restructuring file system
Sachitt Jan 11, 2021
56753d5
restructuring file system
Sachitt Jan 11, 2021
6e8b2d7
Added information acquired by scrolling
Sachitt Jan 11, 2021
825f033
More EDA
Sachitt Jan 11, 2021
5fbda64
Final version of Webscraping DataCleaning EDA files
sourav-naskar Jan 12, 2021
508729a
Merge branch 'md2' of https://github.com/mentorchains/level1_post_rec…
Sachitt Jan 13, 2021
e7f2681
renaming files and adding cleaneddata.csv to be used in md3
Sachitt Jan 13, 2021
f77f4c3
Added a csv file with stopwords to be used in md3
Sachitt Jan 16, 2021
9e46560
adjusted to clean amazon data
Sachitt Jan 30, 2021
1eda659
Add assets folder
elateifsara Jun 15, 2021
6bded57
Add new stuff from md2
elateifsara Jun 15, 2021
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,3 +1,5 @@
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
125,496 changes: 125,496 additions & 0 deletions Sourav_Naskar/Codeacademy_Webscrapper_20210107154307_final.csv

Large diffs are not rendered by default.

1,394 changes: 1,394 additions & 0 deletions Sourav_Naskar/Data_cleaning & EDA_Codeacademy_final.ipynb

Large diffs are not rendered by default.

214 changes: 214 additions & 0 deletions Sourav_Naskar/Webscraping_Codeacademy_final.py
@@ -0,0 +1,214 @@
from selenium import webdriver
from bs4 import BeautifulSoup
import time
from datetime import datetime
import os
import pandas as pd


class CodeacademyWebscraper:
    def __init__(self, webdriverPath):
        # Dictionary of all topics and their attributes
        self.topicDict = {}

        # Pandas dataframe of all topic attributes
        self.topicDataframe = pd.DataFrame(columns=[
            'Topic Title',
            'Category',
            'Tags',
            'Leading Comment',
            'Other Comments',
            'Likes',
            'Views'])

        # Set up the webdriver
        options = webdriver.ChromeOptions()
        options.add_argument('--ignore-certificate-errors')  # Ignore security certificates
        options.add_argument('--incognito')                  # Use Chrome in Incognito mode
        options.add_argument('--headless')                   # Run in the background
        self.driver = webdriver.Chrome(
            executable_path=webdriverPath,
            options=options)

    def get_title(self, topicSoup):
        topicName = topicSoup.find('a', class_='fancy-title').text

        # Remove leading and trailing spaces and newlines
        topicName = topicName.replace('\n', '').strip()
        return topicName

    def get_category_and_tags(self, topicSoup):
        topicCategoryDiv = topicSoup.find('div', class_='topic-category ember-view')
        tagAnchors = topicCategoryDiv.find_all('span', class_='category-name')

        tagList = [anchor.text for anchor in tagAnchors]

        # The first entry is the category; any remaining entries are tags
        category = tagList[0]
        tags = tagList[1:]
        return category, tags

    def get_comments(self, topicSoup):
        # Get the text of every post on the topic page
        postDivs = topicSoup.find_all('div', class_='cooked')
        comments = [div.get_text() for div in postDivs]
        try:
            leading_comment = comments[0]
            other_comments = comments[1:]
        except IndexError:
            leading_comment, other_comments = '', []

        return leading_comment, other_comments

    def get_views(self, topicSoup):
        views = topicSoup.find('li', class_='secondary views')
        if views is None:
            return str(0)
        return views.span.text

    def get_likes(self, topicSoup):
        likes = topicSoup.find('li', class_='secondary likes')
        if likes is None:
            return str(0)
        return likes.span.text

    def runApplication(self, baseURL):
        # Open a Chrome web client using Selenium and retrieve the page source
        self.driver.get(baseURL)

        # Get the links of all categories
        categ_links = self.driver.find_elements_by_css_selector('.category > h3 > a')
        categ_urls = []
        for link in categ_links:
            categ_urls.append(link.get_attribute('href'))

        # Go over each category URL
        for categ_url in categ_urls:
            # Access the category webpage
            self.driver.get(categ_url)

            # Load the entire webpage by scrolling to the bottom
            lastHeight = self.driver.execute_script("return document.body.scrollHeight")
            while True:
                # Scroll to the bottom of the page
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

                # Wait for the new page segment to load
                time.sleep(0.5)

                # Calculate the new scroll height and compare it with the last scroll height
                newHeight = self.driver.execute_script("return document.body.scrollHeight")
                if newHeight == lastHeight:
                    break
                lastHeight = newHeight

            # Generate the category soup object
            categoryHTML = self.driver.page_source
            categorySoup = BeautifulSoup(categoryHTML, 'html.parser')

            # Find all anchor objects that contain topic information
            topicAnchors = categorySoup.find_all('a', class_='title raw-link raw-topic-link')

            # Get the hyperlink references and append them to the base URL to form the topic page URLs
            topicPageURLs = []
            for anchor in topicAnchors:
                topicPageURLs.append(baseURL + anchor['href'])

            # Inner loop over all topics in the category
            for topicURL in topicPageURLs:
                # Get the topic HTML text and generate the topic soup object
                self.driver.get(topicURL)
                topicHTML = self.driver.page_source
                topicSoup = BeautifulSoup(topicHTML, 'html.parser')

                # Scrape all topic attributes of interest
                topicTitle = self.get_title(topicSoup)
                category, tags = self.get_category_and_tags(topicSoup)
                leadingComment, otherComments = self.get_comments(topicSoup)
                numLikes = self.get_likes(topicSoup)
                numViews = self.get_views(topicSoup)

                # Create the attribute dictionary for the topic
                attributeDict = {
                    'Topic Title': topicTitle,
                    'Category': category,
                    'Tags': tags,
                    'Leading Comment': leadingComment,
                    'Other Comments': otherComments,
                    'Likes': numLikes,
                    'Views': numViews}

                # Add the new entry to the topic dictionary and the Pandas dataframe
                # (DataFrame.append was removed in pandas 2.0, so use pd.concat instead)
                self.topicDict[topicTitle] = attributeDict
                self.topicDataframe = pd.concat(
                    [self.topicDataframe, pd.DataFrame([attributeDict])],
                    ignore_index=True)

                print('Topic Title:')
                print(topicTitle)
                print('Category:')
                print(category)
                print('Tags:')
                print(tags)
                print('Leading Comment:')
                print(leadingComment)
                print('Other Comments:')
                print(otherComments)
                print('Likes:')
                print(numLikes)
                print('Views:')
                print(numViews)

        # Get a unique timestamp for this webscraping run
        timeStamp = datetime.now().strftime('%Y%m%d%H%M%S')

        # Save the data in a CSV file stored in the same folder as this program
        csvFilename = 'Codeacademy_Webscrapper_' + timeStamp + '.csv'
        csvFileFullPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), csvFilename)
        self.topicDataframe.to_csv(csvFileFullPath)


if __name__ == '__main__':
    # Local path to the Chrome webdriver (raw string so the backslashes are not escapes)
    webdriverPath = r'C:\Program Files (x86)\chromedriver.exe'

    # Codeacademy forum base URL
    baseURL = 'https://discuss.codecademy.com/'

    # Create the Codeacademy forum webscraping object
    codeacademyWebscraper = CodeacademyWebscraper(webdriverPath)

    # Run the webscraper and save the data
    codeacademyWebscraper.runApplication(baseURL)
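
Note on the output: the Tags and Other Comments columns hold Python lists, which to_csv writes out as their string representations. Below is a minimal sketch of loading the scraped CSV back for the md3 cleaning step, assuming the CSV committed in this PR has the columns the scraper writes:

import ast
import pandas as pd

# Filename matches the CSV committed in this PR; any fresh run of the
# scraper produces the same pattern with a different timestamp
df = pd.read_csv('Sourav_Naskar/Codeacademy_Webscrapper_20210107154307_final.csv',
                 index_col=0)

# List-valued columns come back as strings such as "['python', 'loops']";
# ast.literal_eval turns them back into real lists
for col in ['Tags', 'Other Comments']:
    df[col] = df[col].apply(ast.literal_eval)

print(df.head())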
52 changes: 52 additions & 0 deletions YIZHEN_KU/md2.py
@@ -0,0 +1,52 @@
from bs4 import BeautifulSoup
import requests

### scraping ###
url = "https://sellercentral.amazon.com/forums/"
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
t1 = soup.find_all('a')

# Collect the href attribute of every anchor tag (skip anchors without one)
href_list = []
for t2 in t1:
    t3 = t2.get('href')
    if t3 is not None:
        href_list.append(t3)
print(href_list)

# Complete the relative URLs by prepending the site root
def geturl(incompletelist, completelist):
    original = "https://sellercentral.amazon.com"
    for i in range(len(incompletelist)):
        completelist.append(original + incompletelist[i])

href_list_full = []
geturl(href_list, href_list_full)
print(href_list_full)

# The last path segment of each link is the board name
nameofboard = []
for i in range(len(href_list)):
    nameofboard.append(href_list[i].rpartition('/')[-1])

# Dictionary of all subcategories in the Amazon seller forum
dictamz = dict(zip(nameofboard, href_list_full))
# print(dictamz)

url = dictamz[nameofboard[1]]
# url = "https://sellercentral.amazon.com/forums/c/selling-on-amazon"
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')

### Failed attempt 01: the <td> cells in the static HTML are empty
players = [elem.text for elem in soup.find_all('td')]
players_list = soup.find_all('td')
for player in players_list:
    print(player.text)

### Failed attempt 02: no table in the static HTML, so read_html raises an error
import pandas as pd

url = 'https://sellercentral.amazon.com/forums/c/selling-on-amazon'
pd.read_html(url)
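
Both failed attempts have the same cause: the Seller Central forum is a Discourse site that renders its topic tables client-side with JavaScript, so the static HTML fetched by requests contains no populated <td> cells and no <table> for pandas to parse. A hedged sketch of one way around this, assuming the forum exposes the standard Discourse JSON routes (append .json to a category URL):

import requests

# Assumption: a standard Discourse instance, which mirrors each category
# page as JSON at the same URL with a '.json' suffix
url = 'https://sellercentral.amazon.com/forums/c/selling-on-amazon.json'
resp = requests.get(url, headers={'Accept': 'application/json'})
resp.raise_for_status()

# In the Discourse payload the topics live under topic_list -> topics
for topic in resp.json()['topic_list']['topics']:
    print(topic['title'], topic['views'], topic['like_count'])

Alternatively, driving the page with Selenium (as in the Codeacademy scraper above) lets the JavaScript run before the HTML is parsed.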
