# scripts/newgrounds-movie-scrapper.py

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import asyncio
import requests
import os
import time


def scroll_and_scrape(driver, url, scroll_count=30, scroll_delay=0.2):
    """Log in interactively, open *url*, scroll it, and return its HTML.

    The Newgrounds home page is opened first and the function blocks on
    ``input()`` until the user confirms (by pressing Enter) that they have
    logged in through the browser window. Afterwards *url* is loaded and
    PAGE_DOWN is sent to the ``<body>`` element repeatedly before the page
    source is captured.

    Args:
        driver: Selenium WebDriver instance used for all navigation.
        url: Address of the listing page to scroll and capture.
        scroll_count: Number of PAGE_DOWN key presses to send. Defaults to
            30, the value that used to be hard-coded.
        scroll_delay: Seconds to wait after each key press. Defaults to 0.2,
            the value that used to be hard-coded.

    Returns:
        The page source (``driver.page_source``) after scrolling.
    """
    driver.get("https://www.newgrounds.com/")

    print("Please login to your Newgrounds account and press enter to continue.")
    input()

    driver.get(url)
    print("Scrolling down the page to load all the movies...")
    body = driver.find_element(By.CSS_SELECTOR, 'body')

    # NOTE(review): presumably the listing loads more entries as it is
    # scrolled; a fixed number of key presses may not reach the end of a very
    # long listing -- raise scroll_count if entries are missing.
    for _ in range(scroll_count):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(scroll_delay)

    return driver.page_source


def find_links(page_content, class_name):
    """Return every ``<a>`` tag with the given CSS class that has an ``href``.

    Each match is printed as ``<1-based index> <href>`` as a progress aid.

    Args:
        page_content: HTML text to parse.
        class_name: CSS class the anchor tags must carry.

    Returns:
        The BeautifulSoup result list of matching anchor tags; each element
        supports ``tag['href']``.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    # href=True skips anchors without a link target, which would otherwise
    # raise KeyError below (and again in any caller reading tag['href']).
    links = soup.find_all('a', class_=class_name, href=True)
    for index, link in enumerate(links, start=1):
        print(index, link['href'])
    return links


def find_src(driver, link, class_name):
    """Open a movie page and return the direct URL of its mp4 file, if any.

    The page at *link* is loaded, the player element is clicked, and after a
    one-second pause the page source is searched for the first
    ``<source type="video/mp4">`` tag.

    Args:
        driver: Selenium WebDriver instance used to load the page.
        link: URL of the movie page.
        class_name: Despite the name, this is the element *ID* of the video
            player (it is looked up with ``By.ID``).

    Returns:
        The ``src`` of the first mp4 source with its query string removed, or
        ``None`` when the player is missing, cannot be clicked, no mp4 source
        is present, or the source URL does not contain ``"uploads"``.
    """
    # Local import keeps this function self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    driver.get(link)

    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises -- so a page without a player is skipped instead of
    # aborting the whole run.
    players = driver.find_elements(By.ID, class_name)
    if not players:
        return None

    try:
        players[0].click()
    except WebDriverException:
        # Player exists but is not clickable (covered, stale, ...): skip it.
        return None
    # NOTE(review): presumably gives the player time to insert its <source>
    # tag after the click -- confirm whether an explicit wait is preferable.
    time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    source = soup.find('source', type='video/mp4')
    if source is None:
        return None

    src = source.get('src')
    if not src or "uploads" not in src:
        return None

    # Drop the query string so only the plain file URL remains.
    return src.split('?')[0]


def _download_to_folder(location, url):
    """Blocking helper: stream *url* into the folder *location*."""
    name = url.split('/')[-1]
    target = os.path.join(location, name)
    # Write to a temporary name first so an interrupted transfer never leaves
    # a truncated file under the final name.
    partial = target + '.part'

    with requests.get(url, stream=True, timeout=30) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as a video.
        response.raise_for_status()
        with open(partial, 'wb') as f:
            # Stream in chunks so the whole video is never held in memory.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

    os.replace(partial, target)


async def download_video(location, url):
    """Download the file at *url* into the folder *location*.

    The file name is the last path segment of *url*. The blocking HTTP
    transfer runs in the event loop's default executor so that several
    downloads can overlap and the event loop stays responsive.

    Args:
        location: Existing directory to save the file into.
        url: Direct URL of the file to download.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
        OSError: If the file cannot be written.
    """
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _download_to_folder, location, url)


async def main():
    """Scrape a Newgrounds movies page and download every video found.

    Steps: create an output folder named after the URL's subdomain, start
    Chrome, let the user log in, collect the movie links, resolve each
    movie's mp4 URL, schedule a download task per video, and finally wait for
    all downloads, reporting any that failed.
    """
    movies_url = 'https://derpixon.newgrounds.com/movies'

    # 'https://derpixon.newgrounds.com/movies' -> 'derpixon'
    folder_name = movies_url.split('.')[0].replace('https://', '')
    os.makedirs(folder_name, exist_ok=True)

    dl_urls = []
    dl_tasks = []
    driver = webdriver.Chrome()
    try:
        page_content = scroll_and_scrape(driver, movies_url)

        print("Scraping movie links...")
        links = find_links(page_content, 'inline-card-portalsubmission')

        print("Scraping the video source & downloading the videos in background...\n")

        total = len(links)
        for index, link in enumerate(links, start=1):
            # Awaiting (rather than time.sleep) hands control to the event
            # loop so already-scheduled download tasks get a chance to run.
            await asyncio.sleep(3)
            # "\033[A" moves the cursor up one line so the counter is
            # rewritten in place.
            print("\033[AVideo ", index, " of ", total)
            video_url = find_src(driver, link['href'], 'ng-global-video-player')
            if video_url is not None:
                dl_urls.append(video_url)
                dl_tasks.append(asyncio.create_task(download_video(folder_name, video_url)))
    finally:
        # Always close the browser, even if scraping fails part-way.
        driver.quit()

    print("Downloading ", len(dl_tasks), " videos...")
    # return_exceptions=True lets every download finish and be reported
    # individually instead of the first failure aborting the wait.
    results = await asyncio.gather(*dl_tasks, return_exceptions=True)
    for video_url, result in zip(dl_urls, results):
        if isinstance(result, Exception):
            print("Failed to download ", video_url, ": ", result)
    print("Done.")


# Run the scraper only when this file is executed as a script (not when it is
# imported); asyncio.run creates the event loop and drives main() to completion.
if __name__ == "__main__":
    asyncio.run(main())