Newgrounds movie scraper Python script

2024-02-25 23:10:44 +01:00
parent c9c39aab7b
commit ad05e7ad64
1 changed files with 105 additions and 0 deletions
--- a/newgrounds-movie-scrapper.py
+++ b/newgrounds-movie-scrapper.py
@@ -0,0 +1,105 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from bs4 import BeautifulSoup
+import asyncio
+import requests
+import os
+import time 
+
+
+def scroll_and_scrape(driver, url):
+    driver.get("https://www.newgrounds.com/")
+
+    print("Please login to your Newgrounds account and press enter to continue.")
+    input()
+
+    driver.get(url)
+    print("Scrolling down the page to load all the movies...")
+    body = driver.find_element(By.CSS_SELECTOR, 'body')
+
+    for _ in range(30):
+        body.send_keys(Keys.PAGE_DOWN)
+        time.sleep(0.2)
+
+    page_content = driver.page_source
+    return page_content
+
+
+def find_links(page_content, class_name):
+    soup = BeautifulSoup(page_content, 'html.parser')
+    links = soup.find_all('a', class_=class_name)
+    for i in range(links.__len__()):
+        print(i+1, links[i]['href'])
+    return links
+
+
+def find_src(driver, link, class_name):
+    driver.get(link)
+
+    video_element = driver.find_element(By.ID, class_name)
+    if video_element is None:
+        return None
+
+    try:
+        video_element.click()
+    except:
+        return None
+    time.sleep(1)
+
+    soup = BeautifulSoup(driver.page_source, 'html.parser')
+    links = soup.find_all('source', type='video/mp4')
+
+    if ("uploads" not in links[0]["src"]):
+        return None
+    
+    src = links[0]['src']
+
+    src = src.split('?')[0]
+    return src
+
+
+async def download_video(location, url):
+    name = url.split('/')[-1]
+    response = requests.get(url)
+
+    with open(location + '/' + name, 'wb') as f:
+        f.write(response.content)
+
+
+
+
+async def main():
+    movies_url = 'https://derpixon.newgrounds.com/movies'
+
+    folder_name = movies_url.split('.')[0].replace('https://','')
+    os.makedirs(folder_name, exist_ok=True)
+
+    driver = webdriver.Chrome()
+    page_content = scroll_and_scrape(driver, movies_url)
+
+    print("Scraping movie links...")
+    links = find_links(page_content, 'inline-card-portalsubmission')
+
+
+    print("Scraping the video source & downloading the videos in background...\n")
+
+    dl_tasks = []
+    for i in range(links.__len__()):
+        time.sleep(3)
+        print("\033[AVideo ", i+1, " of ", links.__len__())
+        video_url = find_src(driver, links[i]['href'], 'ng-global-video-player')
+        if video_url is not None:
+            dl_tasks.append(asyncio.create_task(download_video(folder_name, video_url)))
+
+    driver.quit()
+    print("Downloading ", dl_tasks.__len__, " videos...")
+    await asyncio.gather(*dl_tasks)
+    print("Done.")
+
+
+
+
+
+# Script entry point: start the event loop and run main() only when this
+# file is executed directly, not when it is imported.
+if __name__ == "__main__":
+    asyncio.run(main())