"""Download the movies listed on a Newgrounds user's movies page.

A Selenium-driven Chrome window is used to (manually) log in, scroll the
movies page, and open every movie page to discover the direct ``.mp4``
source URL. The videos themselves are downloaded with ``requests`` in
worker threads while the browser keeps scraping.
"""
import asyncio
import os
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys


def scroll_and_scrape(driver, url):
    """Return the HTML of *url* after an interactive login and scrolling.

    The Newgrounds front page is opened first and the function blocks on
    ``input()`` until the user confirms that they have logged in. *url* is
    then loaded and PAGE_DOWN is sent to the page body 30 times, with a
    short pause between key presses, before the page source is captured.
    """
    driver.get("https://www.newgrounds.com/")

    print("Please login to your Newgrounds account and press enter to continue.")
    input()

    driver.get(url)
    print("Scrolling down the page to load all the movies...")
    body = driver.find_element(By.CSS_SELECTOR, 'body')

    for _ in range(30):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)

    return driver.page_source


def find_links(page_content, class_name):
    """Return every ``<a>`` tag in *page_content* with CSS class *class_name*.

    Only anchors that actually carry an ``href`` attribute are returned, so
    callers can index ``['href']`` on each result safely. Each link is
    printed with its 1-based position.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    links = soup.find_all('a', class_=class_name, href=True)
    for position, link in enumerate(links, start=1):
        print(position, link['href'])
    return links


def find_src(driver, link, class_name):
    """Open *link* and return the direct mp4 URL of its video, or ``None``.

    Despite its name, *class_name* is the element **ID** of the video
    player; the player is clicked so that the page exposes its
    ``<source>`` elements. ``None`` is returned when the player cannot be
    found or clicked, when no ``video/mp4`` source is present, or when the
    source URL does not contain ``"uploads"``. Any query string is stripped
    from the returned URL.
    """
    driver.get(link)

    try:
        # find_element raises (it never returns None) when the player is
        # missing, so lookup and click share one narrow handler.
        driver.find_element(By.ID, class_name).click()
    except WebDriverException:
        return None
    time.sleep(1)  # give the player a moment to insert its <source> tags

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    source = soup.find('source', type='video/mp4', src=True)
    if source is None or "uploads" not in source['src']:
        return None

    return source['src'].split('?')[0]


def _download_to_file(location, url):
    """Blocking helper: stream *url* into the directory *location*."""
    name = url.split('/')[-1]
    target = os.path.join(location, name)
    partial = target + '.part'

    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        # Fail loudly instead of saving an HTML error page as a video.
        response.raise_for_status()
        with open(partial, 'wb') as f:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)

    # Only a fully received file gets the final name.
    os.replace(partial, target)


async def download_video(location, url):
    """Download *url* into *location*, named after the URL's last segment.

    The blocking HTTP transfer runs in the event loop's default thread
    pool so several downloads can overlap with each other and with the
    scraping done in ``main``.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-success HTTP status.
        OSError: if the file cannot be written.
    """
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _download_to_file, location, url)


async def main():
    """Scrape one user's movies page and download every video found."""
    movies_url = 'https://derpixon.newgrounds.com/movies'

    # 'https://<user>.newgrounds.com/...' -> '<user>'
    folder_name = movies_url.split('.')[0].replace('https://', '')
    os.makedirs(folder_name, exist_ok=True)

    dl_tasks = []
    driver = webdriver.Chrome()
    try:
        page_content = scroll_and_scrape(driver, movies_url)

        print("Scraping movie links...")
        links = find_links(page_content, 'inline-card-portalsubmission')

        print("Scraping the video source & downloading the videos in background...\n")

        total = len(links)
        for position, link in enumerate(links, start=1):
            # An awaited sleep (not time.sleep) hands control to the event
            # loop so the downloads queued so far can actually start.
            await asyncio.sleep(3)
            print("\033[AVideo ", position, " of ", total)
            video_url = find_src(driver, link['href'], 'ng-global-video-player')
            if video_url is not None:
                dl_tasks.append(
                    asyncio.create_task(download_video(folder_name, video_url))
                )
    finally:
        # Always close the browser, even if scraping fails half-way.
        driver.quit()

    print("Downloading ", len(dl_tasks), " videos...")
    # return_exceptions=True: one failed download must not abort the rest.
    results = await asyncio.gather(*dl_tasks, return_exceptions=True)
    for result in results:
        if isinstance(result, Exception):
            print("Download failed: ", result)
    print("Done.")


if __name__ == "__main__":
    asyncio.run(main())