# scripts/newgrounds-movie-scrapper.py
#
# 106 lines
# 2.6 KiB
# Python
# Executable File

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import asyncio
import requests
import os
import time
def scroll_and_scrape(driver, url, scroll_count=30, scroll_pause=0.2):
    """Wait for a manual login, open *url*, scroll it and return its HTML.

    The browser is first sent to the Newgrounds home page and the function
    blocks on ``input()`` until the user presses enter. It then loads *url*
    and sends PAGE_DOWN key presses to the ``<body>`` element before reading
    the page source.

    Args:
        driver: Selenium WebDriver used for all navigation.
        url: Address of the page to load and scroll.
        scroll_count: Number of PAGE_DOWN key presses to send. The count is
            fixed, so a very long page may need a larger value to be fully
            loaded.
        scroll_pause: Seconds to sleep after each key press.

    Returns:
        The value of ``driver.page_source`` after the last scroll step.
    """
    driver.get("https://www.newgrounds.com/")
    print("Please login to your Newgrounds account and press enter to continue.")
    input()

    driver.get(url)
    print("Scrolling down the page to load all the movies...")
    body = driver.find_element(By.CSS_SELECTOR, 'body')
    for _ in range(scroll_count):
        body.send_keys(Keys.PAGE_DOWN)
        # Short pause between key presses before sending the next one.
        time.sleep(scroll_pause)

    return driver.page_source
def find_links(page_content, class_name):
    """Return the ``<a>`` tags with CSS class *class_name* found in the HTML.

    Each match is also printed as ``<1-based position> <href>``.

    Args:
        page_content: HTML text to parse.
        class_name: CSS class the anchors must carry.

    Returns:
        The BeautifulSoup result list of matching anchor tags. Only anchors
        that actually have an ``href`` attribute are included, so callers can
        index ``tag['href']`` without risking a ``KeyError``.
    """
    soup = BeautifulSoup(page_content, 'html.parser')
    # href=True restricts the match to anchors that define the attribute.
    links = soup.find_all('a', class_=class_name, href=True)
    for position, link in enumerate(links, start=1):
        print(position, link['href'])
    return links
def find_src(driver, link, class_name):
    """Open *link* and return the direct mp4 URL of its video, if any.

    The element whose id equals *class_name* is clicked, the function waits
    one second, and the first ``<source type="video/mp4">`` tag in the
    resulting page is inspected.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        link: URL of the page to open.
        class_name: Despite the name, this is matched against the element
            **id** (``By.ID``), not a CSS class.

    Returns:
        The ``src`` of the first mp4 source with its query string removed, or
        ``None`` when the element is missing, cannot be clicked, no mp4 source
        exists, or the source URL does not contain ``"uploads"``.
    """
    # Local import keeps this block self-contained; selenium is already a
    # dependency of this file.
    from selenium.common.exceptions import WebDriverException

    driver.get(link)

    # find_element raises when nothing matches instead of returning None, so
    # use find_elements, which yields an empty list in that case.
    players = driver.find_elements(By.ID, class_name)
    if not players:
        return None

    try:
        players[0].click()
    except WebDriverException:
        # Element present but not clickable (covered, stale, hidden, ...).
        return None

    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    sources = soup.find_all('source', type='video/mp4')
    if not sources:
        return None

    src = sources[0].get('src')
    if not src or "uploads" not in src:
        return None

    # Drop the query string so only the bare file URL is returned.
    return src.split('?')[0]
def _fetch_to_file(url, destination):
    """Stream *url* into the file *destination* (blocking helper)."""
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail on HTTP errors instead of saving an error page as a video.
        response.raise_for_status()
        with open(destination, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)


async def download_video(location, url):
    """Download *url* into the directory *location* without blocking the loop.

    The file name is the last path segment of *url*. The blocking HTTP
    transfer runs in the event loop's default executor so that several
    downloads can overlap and other coroutines keep running.

    Args:
        location: Existing directory the file is written to.
        url: Address of the file to download.

    Returns:
        None. Network and file-system errors are reported on stdout and any
        partially written file is removed; they are not raised, so one failed
        download does not abort the others.
    """
    name = url.split('/')[-1]
    destination = os.path.join(location, name)
    loop = asyncio.get_running_loop()
    try:
        await loop.run_in_executor(None, _fetch_to_file, url, destination)
    except (requests.RequestException, OSError) as err:
        print(f"Failed to download {url}: {err}")
        # Do not leave a truncated file behind.
        try:
            os.remove(destination)
        except OSError:
            pass
async def main(movies_url='https://derpixon.newgrounds.com/movies'):
    """Scrape a Newgrounds movies page and download every video found.

    A Chrome WebDriver is started, the listing page is scrolled and parsed
    for submission links, each submission is visited to resolve its mp4 URL,
    and a download task is scheduled for every URL found.

    Args:
        movies_url: Listing page to scrape. The output folder name is the
            text between ``https://`` and the first ``.`` of this URL
            (``derpixon`` for the default).
    """
    folder_name = movies_url.split('.')[0].replace('https://', '')
    os.makedirs(folder_name, exist_ok=True)

    driver = webdriver.Chrome()
    dl_tasks = []
    try:
        page_content = scroll_and_scrape(driver, movies_url)
        print("Scraping movie links...")
        links = find_links(page_content, 'inline-card-portalsubmission')
        print("Scraping the video source & downloading the videos in background...\n")
        total = len(links)
        for position, link in enumerate(links, start=1):
            # Awaiting (rather than time.sleep) yields to the event loop so
            # already scheduled download tasks can start running.
            await asyncio.sleep(3)
            # "\033[A" moves the cursor up one line so the counter is
            # rewritten in place.
            print("\033[AVideo ", position, " of ", total)
            video_url = find_src(driver, link['href'], 'ng-global-video-player')
            if video_url is not None:
                dl_tasks.append(asyncio.create_task(download_video(folder_name, video_url)))
    finally:
        # Always close the browser, even if scraping fails part-way.
        driver.quit()

    print("Downloading ", len(dl_tasks), " videos...")
    # Collect failures instead of aborting on the first one.
    results = await asyncio.gather(*dl_tasks, return_exceptions=True)
    for result in results:
        if isinstance(result, BaseException):
            print("Download failed: ", result)
    print("Done.")
# Script entry point: start the event loop and run the scraper only when the
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())