Initial Commit

This commit is contained in:
Daniel Cortes
2020-05-22 01:34:27 -04:00
commit e397cf4c58
2 changed files with 57 additions and 0 deletions

1
last_page Executable file
View File

@@ -0,0 +1 @@
74

56
scrap.py Executable file
View File

@@ -0,0 +1,56 @@
#!/usr/bin/python3.7
import os
import re
import time
import math
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
# Scraper for modarchive.org genre search results (query=54): walks the result
# pages, downloads every link titled "Download" into DOWNLOAD_DIR, and records
# the last fully processed page in LAST_PAGE_FILE so a later run can resume.

# Search-results URL; `{}` is replaced by the page number.
base_url = 'https://modarchive.org/index.php?query=54&request=search&search_type=genre&page={}'
headers = {'User-Agent': 'I want music, im sorry :c, I will do it slowly'}

LAST_PAGE_FILE = 'last_page'   # checkpoint: last page that was fully processed
DOWNLOAD_DIR = 'mods'          # destination directory for downloaded files
FINAL_PAGE = 74                # last result page to visit (inclusive)
BLOCK_SIZE = 1024              # bytes requested per streamed chunk
DOWNLOAD_DELAY_SECONDS = 10    # pause after each completed download
PAGE_DELAY_SECONDS = 0         # pause after each page; 0 disables the pause
REQUEST_TIMEOUT = 30           # seconds before a stalled request is abandoned


def parse_filename(content_disposition):
    """Return a safe file name taken from a Content-Disposition header value.

    The header is supplied by the remote server, so the value is reduced to
    its final path component; a name such as ``../../x`` therefore cannot
    escape the download directory. Surrounding double quotes are removed.

    Returns ``None`` when the header carries no usable ``filename`` parameter.
    """
    match = re.search(
        r'filename\s*=\s*(?:"([^"]*)"|([^;]+))',
        content_disposition,
        re.IGNORECASE,
    )
    if match is None:
        return None
    raw = match.group(1) if match.group(1) is not None else match.group(2)
    # Treat backslashes as separators too, so Windows-style paths are stripped.
    name = os.path.basename(raw.strip().replace('\\', '/'))
    if name in ('', '.', '..'):
        return None
    return name


def chunk_count(content_length, block_size):
    """Return how many ``block_size`` chunks cover ``content_length`` bytes.

    ``content_length`` may be an int, a numeric string (a raw header value),
    or ``None`` when the server did not send the header; ``None`` is returned
    in that case so the progress bar runs without a known total.
    """
    if content_length is None:
        return None
    # True division is required here: floor division would drop the final
    # partial chunk before math.ceil ever sees it.
    return math.ceil(int(content_length) / block_size)


def wait(seconds, reason):
    """Announce and perform a pause of ``seconds`` seconds; no-op when <= 0."""
    if seconds <= 0:
        return
    print('Waiting {} seconds between {}'.format(seconds, reason))
    for _ in tqdm(range(seconds), unit='s', unit_scale=True):
        time.sleep(1)


def read_last_page():
    """Return the page number stored in LAST_PAGE_FILE, or 0 if it is absent."""
    try:
        with open(LAST_PAGE_FILE, 'r') as f:
            return int(f.read())
    except FileNotFoundError:
        return 0


def write_last_page(page_index):
    """Store ``page_index`` in LAST_PAGE_FILE as the resume checkpoint."""
    with open(LAST_PAGE_FILE, 'w') as f:
        f.write(str(page_index))


def download(index, url):
    """Stream ``url`` into DOWNLOAD_DIR and return True if a file was written.

    ``index`` is only used to label the progress bar. Returns False when the
    response names no usable file or the target file already exists. Raises
    ``requests.HTTPError`` on an HTTP error status.
    """
    # The context manager closes the streamed response on every exit path,
    # including the early returns that never read the body.
    with requests.get(url, stream=True, headers=headers,
                      timeout=REQUEST_TIMEOUT) as remote_file:
        remote_file.raise_for_status()
        filename = parse_filename(
            remote_file.headers.get('content-disposition', ''))
        if filename is None:
            print('No usable filename in response for {}, skipping'.format(url))
            return False
        target = os.path.join(DOWNLOAD_DIR, filename)
        if os.path.isfile(target):
            print('{} already exists, skiping'.format(filename))
            return False
        total = chunk_count(remote_file.headers.get('content-length'),
                            BLOCK_SIZE)
        # Write to a temporary name first: an interrupted transfer must not
        # leave a truncated file that a later run would treat as complete.
        partial = target + '.part'
        with open(partial, 'wb') as handle:
            for data in tqdm(remote_file.iter_content(BLOCK_SIZE),
                             desc='{} -> {}'.format(index, url),
                             total=total, unit='KB', unit_scale=True,
                             miniters=0, mininterval=0):
                handle.write(data)
    os.replace(partial, target)
    return True


def main():
    """Process every result page after the checkpoint, up to FINAL_PAGE."""
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
    for page_index in range(read_last_page() + 1, FINAL_PAGE + 1):
        url = base_url.format(page_index)
        print('Requesting url {}'.format(url))
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        # Fail loudly on an error page instead of parsing it, finding zero
        # links, and checkpointing the page as finished.
        response.raise_for_status()
        soup = bs(response.content, 'html.parser')
        links = soup.find_all('a', title='Download')
        print('Found {} links to download'.format(len(links)))
        for i, link in enumerate(links, start=1):
            if download(i, link['href']):
                wait(DOWNLOAD_DELAY_SECONDS, 'downloads')
        # NOTE(review): the original indentation was lost; this assumes the
        # checkpoint is written once per completed page - confirm.
        write_last_page(page_index)
        wait(PAGE_DELAY_SECONDS, 'pages')


if __name__ == '__main__':
    main()