Newgrounds movie scraper Python script
This commit is contained in:
parent
c9c39aab7b
commit
ad05e7ad64
105
newgrounds-movie-scrapper.py
Executable file
105
newgrounds-movie-scrapper.py
Executable file
@ -0,0 +1,105 @@
|
|||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import asyncio
|
||||||
|
import requests
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def scroll_and_scrape(driver, url, scroll_count=30, scroll_pause=0.2):
    """Pause for a manual login, open *url*, scroll it, and return its HTML.

    The browser is first sent to the Newgrounds home page and the function
    blocks on ``input()`` until the user presses enter, giving them time to
    sign in by hand. It then loads *url* and sends PAGE_DOWN key presses to
    the ``<body>`` element before capturing the page source.

    Args:
        driver: Selenium WebDriver used for all navigation.
        url: Page to load and scroll once the login pause is over.
        scroll_count: Number of PAGE_DOWN key presses to send. Defaults to
            30, the value that used to be hard-coded.
        scroll_pause: Seconds to sleep after each key press. Defaults to
            0.2, the value that used to be hard-coded.

    Returns:
        ``driver.page_source`` as it is after the last scroll step.
    """
    driver.get("https://www.newgrounds.com/")

    print("Please login to your Newgrounds account and press enter to continue.")
    input()

    driver.get(url)
    print("Scrolling down the page to load all the movies...")
    body = driver.find_element(By.CSS_SELECTOR, 'body')

    # NOTE(review): the scroll count is fixed, not adaptive -- presumably a
    # very long listing needs a larger scroll_count to load completely.
    for _ in range(scroll_count):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(scroll_pause)

    return driver.page_source
|
||||||
|
|
||||||
|
|
||||||
|
def find_links(page_content, class_name):
    """Collect and print the anchors in *page_content* that carry *class_name*.

    Args:
        page_content: HTML text to parse.
        class_name: CSS class an ``<a>`` tag must have to be selected.

    Returns:
        The BeautifulSoup result list of matching ``<a>`` tags. Every
        returned tag has an ``href`` attribute, so callers may index
        ``tag['href']`` safely.
    """
    soup = BeautifulSoup(page_content, 'html.parser')

    # href=True drops anchors that have no href at all; without it both the
    # listing below and callers that read link['href'] would raise KeyError.
    links = soup.find_all('a', class_=class_name, href=True)

    # Print a 1-based numbered listing of the targets that were found.
    for position, link in enumerate(links, start=1):
        print(position, link['href'])

    return links
|
||||||
|
|
||||||
|
|
||||||
|
def find_src(driver, link, class_name):
    """Open *link*, click the video player, and return the mp4 source URL.

    Args:
        driver: Selenium WebDriver used to load and interact with the page.
        link: URL of the page to open.
        class_name: Despite the name, this is used as an element **ID**
            (``By.ID``) to locate the player element that gets clicked.

    Returns:
        The ``src`` of the first ``<source type="video/mp4">`` element with
        any query string removed, or ``None`` when the player element is
        missing, the click fails, no mp4 source is present, or the source
        URL does not contain ``"uploads"``.
    """
    # Function-scope import keeps this block self-contained; selenium is
    # already a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    driver.get(link)

    # find_element() raises when nothing matches (it never returns None), so
    # use find_elements(), which yields an empty list for a missing player.
    matches = driver.find_elements(By.ID, class_name)
    if not matches:
        return None
    video_element = matches[0]

    try:
        video_element.click()
    except WebDriverException:
        # Not clickable / intercepted / stale: treat as "no video here".
        return None

    # NOTE(review): presumably gives the player time to insert its <source>
    # tags after the click -- confirm the delay is sufficient.
    time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    sources = soup.find_all('source', type='video/mp4')
    if not sources:
        return None

    src = sources[0].get('src')
    # Only source URLs containing "uploads" are accepted.
    if not src or "uploads" not in src:
        return None

    # Drop the query string so only the bare file URL is returned.
    return src.split('?')[0]
|
||||||
|
|
||||||
|
|
||||||
|
async def download_video(location, url):
    """Download *url* into the directory *location* without blocking the loop.

    The HTTP transfer is done with the blocking ``requests`` library, so it
    is handed to the event loop's default executor; several downloads can
    then overlap with each other and with other coroutines.

    Args:
        location: Existing directory the file is written into.
        url: Direct URL of the file; its last path segment becomes the
            file name.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status.
        OSError: If the destination file cannot be written.
    """
    loop = asyncio.get_running_loop()
    await loop.run_in_executor(None, _download_video_blocking, location, url)


def _download_video_blocking(location, url):
    """Stream *url* to ``location/<last URL segment>`` (blocking helper)."""
    name = url.split('/')[-1]

    # stream=True + iter_content avoids holding a whole video in memory; the
    # timeout bounds connect/read stalls rather than the total transfer time.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly instead of saving an HTTP error page as a video file.
        response.raise_for_status()
        with open(os.path.join(location, name), 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
async def main():
    """Scrape one Newgrounds movies page and download every video found.

    Steps: create an output folder named after the URL's subdomain, start
    Chrome, let the user log in and scroll the listing, collect the movie
    links, resolve each link to a video source URL, and schedule a download
    task for every source that was found. The browser is always closed, and
    failed downloads are reported instead of aborting the remaining ones.
    """
    movies_url = 'https://derpixon.newgrounds.com/movies'

    # 'https://derpixon.newgrounds.com/movies' -> 'derpixon'
    folder_name = movies_url.split('.')[0].replace('https://', '')
    os.makedirs(folder_name, exist_ok=True)

    dl_tasks = []
    driver = webdriver.Chrome()
    try:
        page_content = scroll_and_scrape(driver, movies_url)

        print("Scraping movie links...")
        links = find_links(page_content, 'inline-card-portalsubmission')

        print("Scraping the video source & downloading the videos in background...\n")

        total = len(links)
        for index, link in enumerate(links, start=1):
            # Await instead of time.sleep(): yielding to the event loop is
            # what lets the already-scheduled download tasks make progress.
            await asyncio.sleep(3)
            # "\033[A" moves the cursor up one line so the counter is
            # redrawn in place.
            print("\033[AVideo ", index, " of ", total)
            video_url = find_src(driver, link['href'], 'ng-global-video-player')
            if video_url is not None:
                dl_tasks.append(
                    asyncio.create_task(download_video(folder_name, video_url))
                )
    finally:
        # Close the browser even if scraping raised.
        driver.quit()

    print("Downloading ", len(dl_tasks), " videos...")
    # return_exceptions=True: one failed download must not abort the others.
    results = await asyncio.gather(*dl_tasks, return_exceptions=True)
    for outcome in results:
        if isinstance(outcome, BaseException):
            print("Download failed: ", repr(outcome))
    print("Done.")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: start the async scraper only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
|
Loading…
x
Reference in New Issue
Block a user