<a href="https://colab.research.google.com/github/jakobzhao/geog458/blob/master/labs/lab02/youtube2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# created on April 14, 2021
# modified on Jan 2, 2022
# modified on April 20, 2023
# modified on January 5, 2024
# @author:          Bo Zhao
# @email:           zhaobo@uw.edu
# @website:         https://hgis.uw.edu
# @organization:    Department of Geography, University of Washington, Seattle
# @description:     A demo of collecting data from YouTube.

In [None]:
%%shell
# NOTE: %%shell is a cell magic meaning "run this whole cell as a shell script".
# IPython only recognizes a cell magic when it is the very first line of the cell,
# so it must stay above these comments (which the shell simply ignores).
#
# Install and configure selenium on Google Colab (an Ubuntu-based environment).
# To install selenium, you will need to update ubuntu, install some dependencies, and install Chrome.
# selenium is a Python package that allows you to automate your browser.
# chromedriver is a program that allows you to control Chrome from Python.
# chromedriver_autoinstaller is a Python package that automatically installs chromedriver; it can help you determine which version of chromedriver to install based on your version of Chrome.

# Refresh the package index and install the download/unpack tools used below.
sudo apt -y update
sudo apt install -y wget curl unzip

# libu2f-udev is installed from a .deb before Chrome's own package is installed.
wget http://archive.ubuntu.com/ubuntu/pool/main/libu/libu2f-host/libu2f-udev_1.1.4-1_all.deb
dpkg -i libu2f-udev_1.1.4-1_all.deb

# Download and install the current stable Google Chrome build.
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
dpkg -i google-chrome-stable_current_amd64.deb

# Install the Python packages that drive Chrome.
pip install selenium chromedriver_autoinstaller

In [None]:
from bs4 import BeautifulSoup # Import BeautifulSoup to parse the HTML.
import time, datetime # Import time and datetime to record the time.
import pandas as pd # Import pandas to create a dataframe, and it can save the dataframe as a csv file.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Build the launch options for Chrome: an Options instance collects the
# command-line arguments that will be handed to the browser.
chrome_options = Options()

# Arguments, added in this order:
#   --headless               run Chrome in headless mode
#   --no-sandbox             run Chrome in no-sandbox mode
#   --disable-dev-shm-usage  run Chrome in disable-dev-shm-usage mode
for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(flag)

# Create the Chrome driver instance ("bot"), passing the options to its constructor.
bot = webdriver.Chrome(options=chrome_options)

In [None]:
# Collect video entries from a YouTube search-result page.
#
# The bot loads the search url, then repeatedly (1) parses the current page
# source, (2) records any video it has not seen yet, and (3) scrolls down so
# the page loads more entries. The collected rows end up in the dataframe `df`.

# The url where the data will be collected from.
url = "https://www.youtube.com/results?search_query=standing+rock"

# The number of parse-and-scroll rounds. In practice, you might want to develop a
# different interaction approach to load and view the web pages.
num_rounds = 5

# Seconds to rest after each scroll. It is very important to let the bot take some
# rest before resuming work: it makes the data collection more human-like (otherwise
# the bot may be blocked by the website), and it gives the page time to load the
# newly requested entries before they are parsed in the next round.
pause_seconds = 5

# A list to store all the video urls. If a video has already been crawled, it is not stored again.
video_urls = []
# A list to store the retrieved video details (one dict per video).
results = []

try:
    # Input the targeting url to the bot, and the bot will load data from the url.
    bot.get(url)

    for i in range(num_rounds):

        # Create a document object model (DOM) from the raw source of the crawled web page.
        # Since you are processing a html page, 'html.parser' is chosen.
        soup = BeautifulSoup(bot.page_source, 'html.parser')

        # Capture all the video items using the find_all method.
        # To view the information of the html elements you want to collect, inspect the raw source using Chrome Inspector.
        # [-20:] keeps only the last 20 entries, i.e. the most recently loaded ones.
        videos = soup.find_all('ytd-video-renderer', class_="style-scope ytd-item-section-renderer")[-20:]

        # iterate and process each video entry.
        for video in videos:

            # Extract the video url, user url, username, title, view number, created time,
            # short description, and collected time. A missing element shows up as one of
            # the lookup errors caught below; such an entry is reported and skipped so that
            # one malformed item does not stop the whole crawl.
            try:
                video_url = video.find("a", class_="yt-simple-endpoint style-scope ytd-video-renderer").attrs["href"]
                # The same anchor carries both the channel url and the channel name.
                channel_link = video.find("a", class_="yt-simple-endpoint style-scope yt-formatted-string")
                user_url = channel_link.attrs["href"]
                username = channel_link.text
                title = video.find("yt-formatted-string", class_="style-scope ytd-video-renderer").text
                metadata_items = video.find_all("span", class_="inline-metadata-item style-scope ytd-video-meta-block")
                view_num = metadata_items[0].text.replace(" views", "")
                created_at = metadata_items[1].text.replace(" ago", "")
            except (AttributeError, KeyError, IndexError) as err:
                print("skipped a video entry that could not be parsed:", repr(err))
                continue

            # The description snippet is treated as optional: if its element is absent,
            # keep the video and store an empty description instead of dropping the row.
            shortdesc_tag = video.find("yt-formatted-string", class_="metadata-snippet-text style-scope ytd-video-renderer")
            shortdesc = shortdesc_tag.text if shortdesc_tag is not None else ""
            collected_at = datetime.datetime.now()

            # create a row in the dict format.
            row = {'video_url': video_url,
                   'user_url': user_url,
                   'username': username,
                   'title': title,
                   'view_num': view_num,
                   'created_at': created_at,
                   'shortdesc': shortdesc,
                   'collected_at': collected_at}

            # if a video has been added, this video would not be inserted to the results list,
            # otherwise, this video will be inserted.
            if video_url in video_urls:
                print("this video has already been added.")
            else:
                print(row)
                video_urls.append(video_url)
                results.append(row)

        # After the last round nothing will be parsed again, so there is no need to scroll or wait.
        if i < num_rounds - 1:
            # Let the bot scroll down to the bottom of the content element, then rest while
            # the page loads more entries for the next round.
            bot.execute_script('window.scrollTo(0,  document.getElementById("content").scrollHeight);')
            time.sleep(pause_seconds)
finally:
    # Terminate the bot even if the crawl failed. quit() ends the whole driver session
    # (close() would only close the current window).
    bot.quit()

# Store the results as a pandas dataframe
df = pd.DataFrame(results)

# notify the completion of the crawling in the console.
print("the crawling task is finished.")

In [None]:
# Save the collected data to Google Drive.
from google.colab import drive

# The file path on Google Drive where the output csv is stored.
output_file = '/gdrive/My Drive/videos.csv'

# Mount your Drive to the Colab VM under /gdrive.
drive.mount('/gdrive')

# Write the dataframe to the csv file, leaving out the index column.
df.to_csv(path_or_buf=output_file, index=False)

In [None]:
# Send the csv saved at `output_file` to your local computer as a download.
from google.colab import files

files.download(output_file)

# Report completion in the console.
done_message = "the csv has been downloaded to your local computer. The program has been completed successfully."
print(done_message)