Supercharging superbooga (#3272)
This commit is contained in:
parent
ad00b8eb26
commit
0845724a89
21 changed files with 12294 additions and 2 deletions
65
extensions/superboogav2/download_urls.py
Normal file
65
extensions/superboogav2/download_urls.py
Normal file
|
|
@@ -0,0 +1,65 @@
|
|||
import concurrent.futures
|
||||
import requests
|
||||
import re
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
import extensions.superboogav2.parameters as parameters
|
||||
|
||||
from .data_processor import process_and_add_to_collector
|
||||
from .utils import create_metadata_source
|
||||
|
||||
def _download_single(url, timeout=5):
    """Download a single URL and return the raw response body.

    Args:
        url: The URL to fetch with ``requests.get``.
        timeout: Timeout value forwarded to ``requests.get``. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        The response body (``response.content``).

    Raises:
        Exception: If the response status code is anything other than 200.
            Errors raised by ``requests.get`` itself propagate unchanged.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        # Name the URL and status so a failure can be diagnosed by whoever
        # catches or logs this exception.
        raise Exception(f"Failed to download URL {url!r} (HTTP {response.status_code})")

    return response.content
|
||||
|
||||
|
||||
def _download_urls(urls, threads=1):
    """Download several URLs concurrently, yielding progress as they finish.

    This is a generator. After each successful download it yields a tuple
    ``(progress, results)`` where ``progress`` is ``"<done>/<total>"`` and
    ``results`` is the list of bodies downloaded so far, in completion order.
    The same list object is yielded every time and keeps growing. A final
    ``("Done", results)`` tuple is always yielded, even if nothing succeeded.

    Downloads are best-effort: a URL whose download raises is skipped and
    reported through a logging warning instead of aborting the batch.

    Args:
        urls: Sized collection of URLs to download (``len(urls)`` is used for
            the progress total).
        threads: Maximum number of worker threads for the pool.
    """
    # Function-scope import so this block does not depend on a file-level
    # import of ``logging``.
    import logging

    logger = logging.getLogger(__name__)
    total = len(urls)
    results = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        # Remember which URL each future belongs to so failures can be named.
        future_to_url = {executor.submit(_download_single, url): url for url in urls}

        for future in concurrent.futures.as_completed(future_to_url):
            # Keep the try body limited to the call that can actually fail.
            try:
                content = future.result()
            except Exception as exc:
                logger.warning("Skipping URL %r: %s", future_to_url[future], exc)
                continue

            results.append(content)
            # Only successful downloads are counted in the progress string.
            yield f"{len(results)}/{total}", results

    yield "Done", results
|
||||
|
||||
|
||||
def feed_url_into_collector(urls, collector):
    """Download the given URLs, extract their text and add it to the collector.

    This is a generator that yields human-readable status strings while it
    works: a "Loading ..." header, then the header plus each progress update
    from ``_download_urls``, then the header plus a "Processing" notice.

    Args:
        urls: A string with one URL per line. Blank lines and surrounding
            whitespace (including carriage returns) are ignored.
        collector: Passed through unchanged to
            ``process_and_add_to_collector``.
    """
    # Ignore blank lines and stray whitespace so empty input or Windows line
    # endings do not turn into bogus download attempts.
    url_list = [line.strip() for line in urls.split('\n') if line.strip()]
    num_threads = parameters.get_num_threads()

    cumulative = f'Loading {len(url_list)} URLs with {num_threads} threads...\n\n'
    yield cumulative

    # _download_urls always ends with a ("Done", results) item, so after the
    # loop ``contents`` holds every successfully downloaded body.
    contents = []
    for update, contents in _download_urls(url_list, threads=num_threads):
        yield cumulative + update

    cumulative += 'Processing the HTML sources...'
    yield cumulative

    strong_cleanup = parameters.get_is_strong_cleanup()
    letter_then_space = re.compile("[A-Za-z] ")

    page_texts = []
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")

        # Remove script and style elements so their contents are not indexed.
        for tag in soup(["script", "style"]):
            tag.extract()

        strings = soup.stripped_strings
        if strong_cleanup:
            # Keep only strings containing a letter followed by a space.
            strings = [s for s in strings if letter_then_space.search(s)]

        page_texts.append('\n'.join(s.strip() for s in strings))

    # Separate pages with a newline so the last line of one page is not fused
    # with the first line of the next.
    all_text = '\n'.join(page_texts)

    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))
|
||||
Loading…
Add table
Add a link
Reference in a new issue