Initial Commit
This commit is contained in:
56
scrap.py
Executable file
56
scrap.py
Executable file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/python3.7
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import math
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
from bs4 import BeautifulSoup as bs
|
||||
|
||||
# Search-results URL template; ``{}`` is replaced by the page number.
base_url = 'https://modarchive.org/index.php?query=54&request=search&search_type=genre&page={}'
headers = {'User-Agent' : 'I want music, im sorry :c, I will do it slowly'}

# File holding the number of the last fully processed results page.
LAST_PAGE_FILE = "last_page"
# Directory the downloaded files are written to.
DOWNLOAD_DIR = 'mods'
# Exclusive upper bound of the page range to walk.
PAGE_LIMIT = 75
# Chunk size, in bytes, used when streaming a download to disk.
BLOCK_SIZE = 1024
# Seconds before a stalled connection is abandoned instead of hanging forever.
REQUEST_TIMEOUT = 30
# Pause after each completed download, matching the message that is printed.
DOWNLOAD_DELAY_SECONDS = 10
# Pause after each results page; 0 disables it (the original script had a
# 60 second page pause commented out).
PAGE_DELAY_SECONDS = 0


def filename_from_disposition(content_disposition):
    """Return a safe bare file name from a Content-Disposition header.

    Surrounding quotes and any trailing ``; param`` are dropped, and only the
    final path component is kept so a value such as ``../../x`` cannot place
    the file outside the download directory.

    Returns ``None`` when the header is missing, has no ``filename=`` part,
    or yields an unusable name (empty, ``.`` or ``..``).
    """
    match = re.search(r'filename="?([^";]+)"?', content_disposition or '')
    if match is None:
        return None
    # Normalise backslashes so Windows-style separators are stripped as well.
    name = os.path.basename(match.group(1).strip().replace('\\', '/'))
    if name in ('', '.', '..'):
        return None
    return name


def block_count(file_size, block_size):
    """Return how many ``block_size`` chunks are needed to hold ``file_size`` bytes.

    Uses true division before rounding up; flooring first (``//``) would
    drop the final partial chunk.
    """
    return math.ceil(file_size / block_size)


def read_last_page(path=LAST_PAGE_FILE):
    """Return the page number stored in ``path``, or 0 if there is none yet.

    A missing or empty file is treated as "nothing processed". Any other
    non-numeric content raises ``ValueError`` rather than being ignored.
    """
    try:
        with open(path, "r") as f:
            text = f.read().strip()
    except FileNotFoundError:
        return 0
    return int(text) if text else 0


def download_module(index, url):
    """Stream ``url`` into ``DOWNLOAD_DIR`` and report whether a file was written.

    ``index`` is only used to label the progress bar. Returns ``False`` when
    the download is skipped (no usable file name, or the file already
    exists) and ``True`` after the file has been written completely.

    Raises ``requests.HTTPError`` on an error status code.
    """
    # The context manager releases the connection on every path, including
    # the early returns where the body is never read.
    with requests.get(url, stream=True, headers=headers, timeout=REQUEST_TIMEOUT) as remote_file:
        remote_file.raise_for_status()

        filename = filename_from_disposition(remote_file.headers.get('content-disposition'))
        if filename is None:
            print('No file name in response for {}, skipping'.format(url))
            return False

        target = os.path.join(DOWNLOAD_DIR, filename)
        if os.path.isfile(target):
            print('{} already exists, skiping'.format(filename))
            return False

        # Content-Length is optional; without it the bar has no total.
        content_length = remote_file.headers.get('content-length')
        total = block_count(int(content_length), BLOCK_SIZE) if content_length else None

        # Write to a temporary name and rename at the end, so an interrupted
        # download is not mistaken for a finished file on the next run.
        partial = target + '.part'
        with open(partial, 'wb') as handle:
            for data in tqdm(remote_file.iter_content(BLOCK_SIZE), desc='{} -> {}'.format(index, url), total=total, unit='KB', unit_scale=True, miniters=0, mininterval=0):
                handle.write(data)
        os.replace(partial, target)
    return True


def main():
    """Walk the remaining result pages and download every linked file.

    Progress is saved to ``LAST_PAGE_FILE`` after each page so a later run
    resumes with the page after the last one that finished.
    """
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    last_page = read_last_page()

    for page_index in range(last_page + 1, PAGE_LIMIT):
        url = base_url.format(page_index)
        print('Requesting url {}'.format(url))

        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()

        soup = bs(response.content, 'html.parser')
        links = soup.find_all('a', title='Download')
        print('Found {} links to download'.format(len(links)))

        for i, link in enumerate(links, start=1):
            if download_module(i, link['href']):
                print('Waiting {} seconds between downloads'.format(DOWNLOAD_DELAY_SECONDS))
                time.sleep(DOWNLOAD_DELAY_SECONDS)

        if PAGE_DELAY_SECONDS:
            print('Waiting {} seconds between pages'.format(PAGE_DELAY_SECONDS))
            time.sleep(PAGE_DELAY_SECONDS)

        with open(LAST_PAGE_FILE, "w") as f:
            f.write(str(page_index))


if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user