Initial Commit
This commit is contained in:
56
scrap.py
Executable file
56
scrap.py
Executable file
@@ -0,0 +1,56 @@
|
|||||||
|
#!/usr/bin/python3.7
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
import math
|
||||||
|
import requests
|
||||||
|
from tqdm import tqdm
|
||||||
|
from bs4 import BeautifulSoup as bs
|
||||||
|
|
||||||
|
# Search-result URL template; ``{}`` receives the page number.
base_url = 'https://modarchive.org/index.php?query=54&request=search&search_type=genre&page={}'
# Sent with every request (page listings and file downloads alike).
headers = {'User-Agent' : 'I want music, im sorry :c, I will do it slowly'}

# File holding the number of the last page that was processed.
LAST_PAGE_FILE = "last_page"
# Directory the downloaded files are written to.
DOWNLOAD_DIR = 'mods/'
# Exclusive upper bound of the page range to walk.
END_PAGE = 75
# Number of bytes requested per streamed chunk.
BLOCK_SIZE = 1024


def filename_from_disposition(content_disposition):
    """Extract a safe local file name from a Content-Disposition header.

    The header comes from the remote server, so any directory components
    are dropped to keep the download inside ``DOWNLOAD_DIR``.

    Raises:
        IndexError: if the header carries no ``filename=`` parameter.
        ValueError: if nothing usable is left after dropping directories.
    """
    raw_name = re.findall("filename=(.+)", content_disposition)[0]
    safe_name = os.path.basename(raw_name)
    if safe_name in ('', '.', '..'):
        raise ValueError('Unusable file name in header: {!r}'.format(raw_name))
    return safe_name


def block_count(file_size, block_size):
    """Return how many blocks of ``block_size`` bytes cover ``file_size`` bytes.

    Uses true division so a trailing partial block is counted; the previous
    ``math.ceil(file_size // block_size)`` floored first and lost it.
    """
    return math.ceil(file_size / block_size)


def download_file(index, file_url):
    """Download ``file_url`` into ``DOWNLOAD_DIR`` unless it already exists.

    ``index`` is only used to label the progress bar.

    Returns:
        True if the file was downloaded, False if it was skipped because a
        file with the same name is already present.
    """
    # The context manager releases the streamed connection on every path,
    # including the early "already exists" return.
    with requests.get(file_url, stream=True, headers=headers) as remote_file:
        filename = filename_from_disposition(
            remote_file.headers['content-disposition'])
        target = DOWNLOAD_DIR + filename

        if os.path.isfile(target):
            print('{} already exists, skiping'.format(filename))
            return False

        # Without a Content-Length header the bar simply has no total.
        content_length = remote_file.headers.get('content-length')
        if content_length is None:
            total_blocks = None
        else:
            total_blocks = block_count(int(content_length), BLOCK_SIZE)

        # Write to a temporary name and rename when complete, so that an
        # interrupted run never leaves a truncated file that a later run
        # would treat as "already exists".
        partial = target + '.part'
        with open(partial, 'wb') as handle:
            for data in tqdm(remote_file.iter_content(BLOCK_SIZE),
                             desc='{} -> {}'.format(index, file_url),
                             total=total_blocks, unit='KB', unit_scale=True,
                             miniters=0, mininterval=0):
                handle.write(data)
        os.replace(partial, target)
    return True


def main():
    """Walk the search pages after the saved one and download every link.

    Reads the resume point from ``LAST_PAGE_FILE`` and rewrites that file
    with the page number after each page.
    """
    with open(LAST_PAGE_FILE, "r") as f:
        last_page = int(f.read())

    os.makedirs(DOWNLOAD_DIR, exist_ok=True)

    for page_index in range(last_page + 1, END_PAGE):
        url = base_url.format(page_index)
        print('Requesting url {}'.format(url))

        request = requests.get(url, headers=headers)
        # Fail loudly on an HTTP error; otherwise an error page would yield
        # zero links and the page would still be recorded as done.
        request.raise_for_status()
        page = request.content

        soup = bs(page, 'html.parser')
        links = soup.find_all('a', title='Download')
        print('Found {} links to download'.format(len(links)))

        for i, link in enumerate(links, start=1):
            if not download_file(i, link['href']):
                continue

            print('Waiting 10 seconds between downloads')
            # NOTE(review): the actual wait is disabled, so the message
            # above is printed without any delay following it.
            #for _ in tqdm(range(10), unit = 's', unit_scale = True):
            #    time.sleep(1)

        #print('Waiting 60 seconds between pages')
        #for _ in tqdm(range(60), unit = 's', unit_scale = True):
        #    time.sleep(1)

        # NOTE(review): the original indentation was lost in this copy;
        # the progress save is assumed to happen once per page - confirm.
        with open(LAST_PAGE_FILE, "w") as f:
            f.write(str(page_index))


if __name__ == "__main__":
    main()
|
||||||
|
|
||||||
Reference in New Issue
Block a user