Mirror of https://github.com/levywang/avhub.git (synced 2026-02-21 08:47:22 +08:00)
feat(proxy): add proxy switch
This commit is contained in:
parent 89cc631947
commit 9fc426121a
@@ -12,6 +12,7 @@ files:
 av_spider:
   source_url: "https://missav.ai/cn/search/"
   proxy_url: "http://192.168.50.3:7890" # http or socks5 proxy
+  use_proxy: false

 hacg_spider:
   source_url: "https://www.hacg.mov/wp/"
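The inline comment says proxy_url accepts an http or socks5 URL; with the requests library, socks5 only works once the PySocks extra is installed (pip install requests[socks]). A minimal sketch with an assumed socks5 endpoint, not one from the repo:

import requests

# Assumed endpoint for illustration; socks5:// URLs need `pip install requests[socks]`.
proxy_url = "socks5://192.168.50.3:7891"
proxies = {"http": proxy_url, "https": proxy_url}

resp = requests.get("https://www.hacg.mov/wp/", proxies=proxies, timeout=10)
print(resp.status_code)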
main.py (+1 line)
@@ -95,6 +95,7 @@ def main(cfg: DictConfig):
     crawler = AVSpider(av_code=code_str,
                        source_url=cfg.av_spider.source_url,
                        proxy_url=cfg.av_spider.proxy_url,
+                       use_proxy=cfg.av_spider.use_proxy,
                        cfg=cfg)
     video_links = crawler.get_video_url()
     all_magnet_links = []
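main() receives the parsed config as an OmegaConf DictConfig, so the new key is read with plain attribute access. A minimal sketch, assuming the YAML above lives in config.yaml:

from omegaconf import OmegaConf

cfg = OmegaConf.load("config.yaml")  # assumed filename
print(cfg.av_spider.use_proxy)       # False, until the user flips the switch
print(cfg.av_spider.proxy_url)       # "http://192.168.50.3:7890"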
Deleted file (standalone HACG scraper script):

@@ -1,93 +0,0 @@
import requests
from bs4 import BeautifulSoup
import re
import json
import os

class HACGScraper:
    def __init__(self, url, filepath):
        self.url = url
        self.filepath = filepath

    def get_pages(self):
        response = requests.get(self.url)
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')
        div_ele = soup.find('div', class_='wp-pagenavi')
        page_text = div_ele.get_text() if div_ele else ''

        # The pager reads "共 N 页" ("N pages in total"); extract N from
        # between the literal characters 共 and 页.
        pages = None
        if "共" in page_text:
            pages = int(page_text.split('共')[1].split('页')[0])

        return pages

    def get_links(self, page):
        # NOTE: `page` is never interpolated into the URL, so every call
        # fetches whatever page number is baked into self.url.
        url = f'{self.url}'
        response = requests.get(url)
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')
        links = {}
        for a_tag in soup.find_all('a'):
            href = a_tag.get('href')
            text = a_tag.get_text(strip=True)
            if "月合集" in text:  # literal match for "monthly collection" post titles
                links[text] = href

        magnet_links = {}
        for title, link in links.items():
            response = requests.get(link)

            if response.status_code == 200:
                content = response.text
                # A BitTorrent v1 info hash is 40 lowercase hex characters.
                matches = re.findall(r'\b[a-f0-9]{40}\b', content)
                if matches:
                    magnet_links[title] = f'magnet:?xt=urn:btih:{matches[0]}'
            else:
                print(f"Request failed, status code: {response.status_code}")

        return magnet_links

    def update_json_file(self):
        if not os.path.exists(self.filepath) or os.path.getsize(self.filepath) == 0:
            results = {}
            total_pages = self.get_pages()
            for i in range(1, total_pages + 1):
                new_data = self.get_links(i)
                results.update(new_data)
                print(f'Page {i} processed (Full Update)')
        else:
            with open(self.filepath, 'r', encoding='utf-8') as file:
                results = json.load(file)

            total_pages = self.get_pages()
            for i in range(1, total_pages + 1):
                new_data = self.get_links(i)
                all_exists = True

                for title, magnet_link in new_data.items():
                    if title not in results or results[title] != magnet_link:
                        all_exists = False
                        break

                if not all_exists:
                    results = {**new_data, **results}
                    print(f'Page {i} processed (Incremental Update)')

                if all_exists:
                    print(f"Page {i} already present in the JSON file, stopping update")
                    break

        with open(self.filepath, 'w', encoding='utf-8') as file:
            json.dump(results, file, ensure_ascii=False, indent=4)

        print("JSON file updated")

# Usage example
scraper = HACGScraper(url='https://www.hacg.mov/wp/page/1?s=%E5%90%88%E9%9B%86&submit=%E6%90%9C%E7%B4%A2', filepath=r"C:\Users\levywang\OneDrive\Code\avhub_v2\data\hacg.json")
scraper.update_json_file()
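The deleted scraper's magnet extraction hinges on a single regex: a BitTorrent v1 info hash is 40 hex characters, and prefixing it with magnet:?xt=urn:btih: yields a usable link. A quick self-contained check (the hash below is a made-up placeholder, not a real torrent):

import re

html = '<a href="magnet:?xt=urn:btih:0123456789abcdef0123456789abcdef01234567">May</a>'
info_hash = re.findall(r'\b[a-f0-9]{40}\b', html)[0]
print(f'magnet:?xt=urn:btih:{info_hash}')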
@@ -8,10 +8,10 @@ from omegaconf import DictConfig
 from utils.logger import setup_logger

 class AVSpider:
-    def __init__(self, av_code, source_url, proxy_url, cfg: DictConfig):
+    def __init__(self, av_code, source_url, proxy_url, use_proxy, cfg: DictConfig):
         self.source_url = source_url
         self.av_code = av_code.lower()
-        self.proxy_url = proxy_url
+        self.proxy_url = proxy_url if use_proxy else None
         self.headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
             'Content-Type': 'application/json'
@@ -19,7 +19,7 @@ class AVSpider:
         self.proxies = {
             "http": self.proxy_url,
             "https": self.proxy_url
-        }
+        } if self.proxy_url else {}
         self.logger = setup_logger(cfg)

     def get_video_url(self) -> list:
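With use_proxy off, AVSpider now stores proxy_url as None and builds an empty proxies mapping, rather than a mapping whose values are None. A minimal sketch of the resulting behaviour, assuming plain requests semantics (an empty mapping leaves requests' defaults intact, including any HTTP_PROXY/HTTPS_PROXY environment variables):

import requests

use_proxy = False
proxy_url = "http://192.168.50.3:7890" if use_proxy else None
proxies = {"http": proxy_url, "https": proxy_url} if proxy_url else {}

# proxies == {} here, so the request goes out directly unless the
# environment defines HTTP_PROXY/HTTPS_PROXY, which requests still honours.
resp = requests.get("https://missav.ai/cn/search/", proxies=proxies, timeout=10)
print(resp.status_code)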