From 9fc426121a45cdfef96fade54daec4c11e2c3e5e Mon Sep 17 00:00:00 2001
From: levywang
Date: Tue, 11 Mar 2025 15:23:08 +0800
Subject: [PATCH] feat(proxy): add proxy switch

---
 data/config.yaml     |  1 +
 main.py              |  1 +
 utils/hacg_spider.py | 93 --------------------------------------------
 utils/spider.py      |  6 +--
 4 files changed, 5 insertions(+), 96 deletions(-)
 delete mode 100644 utils/hacg_spider.py

diff --git a/data/config.yaml b/data/config.yaml
index 0f8639a..36f9826 100644
--- a/data/config.yaml
+++ b/data/config.yaml
@@ -12,6 +12,7 @@ files:
 av_spider:
   source_url: "https://missav.ai/cn/search/"
   proxy_url: "http://192.168.50.3:7890" # http or socks5 proxy
+  use_proxy: false
 
 hacg_spider:
   source_url: "https://www.hacg.mov/wp/"
diff --git a/main.py b/main.py
index 3846036..b1b0446 100644
--- a/main.py
+++ b/main.py
@@ -95,6 +95,7 @@ def main(cfg: DictConfig):
     crawler = AVSpider(av_code=code_str,
                        source_url=cfg.av_spider.source_url,
                        proxy_url=cfg.av_spider.proxy_url,
+                       use_proxy=cfg.av_spider.use_proxy,
                        cfg=cfg)
     video_links = crawler.get_video_url()
     all_magnet_links = []
diff --git a/utils/hacg_spider.py b/utils/hacg_spider.py
deleted file mode 100644
index 4ea3be3..0000000
--- a/utils/hacg_spider.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import re
-import json
-import os
-
-class HACGScraper:
-    def __init__(self, url, filepath):
-        self.url = url
-        self.filepath = filepath
-
-    def get_pages(self):
-        response = requests.get(self.url)
-        html_content = response.text
-
-        soup = BeautifulSoup(html_content, 'html.parser')
-        div_ele = soup.find('div', class_='wp-pagenavi')
-        page_text = div_ele.get_text() if div_ele else ''
-
-        pages = None
-        if "共" in page_text:
-            pages = int(page_text.split('共')[1].split('页')[0])
-
-        return pages
-
-    def get_links(self, page):
-        url = f'{self.url}'
-        response = requests.get(url)
-        html_content = response.text
-
-        soup = BeautifulSoup(html_content, 'html.parser')
-        links = {}
-        for a_tag in soup.find_all('a'):
-            href = a_tag.get('href')
-            text = a_tag.get_text(strip=True)
-            if "月合集" in text:
-                links[text] = href
-
-        magnet_links = {}
-        for title, link in links.items():
-            response = requests.get(link)
-
-            if response.status_code == 200:
-                content = response.text
-                matches = re.findall(r'\b[a-f0-9]{40}\b', content)
-                if matches:
-                    magnet_links[title] = f'magnet:?xt=urn:btih:{matches[0]}'
-            else:
-                print(f"Request failed, status code: {response.status_code}")
-
-        return magnet_links
-
-    def update_json_file(self):
-        if not os.path.exists(self.filepath) or os.path.getsize(self.filepath) == 0:
-            results = {}
-            total_pages = self.get_pages()
-            for i in range(1, total_pages + 1):
-                new_data = self.get_links(i)
-                results.update(new_data)
-                print(f'Page {i} processed (Full Update)')
-        else:
-            with open(self.filepath, 'r', encoding='utf-8') as file:
-                results = json.load(file)
-
-            total_pages = self.get_pages()
-            for i in range(1, total_pages + 1):
-                new_data = self.get_links(i)
-                all_exists = True
-
-                for title, magnet_link in new_data.items():
-                    if title not in results or results[title] != magnet_link:
-                        all_exists = False
-                        break
-
-                if not all_exists:
-                    results = {**new_data, **results}
-                    print(f'Page {i} processed (Incremental Update)')
-
-                if all_exists:
-                    print(f"Page {i} data already exists in the JSON file, stopping update")
-                    break
-
-        with open(self.filepath, 'w', encoding='utf-8') as file:
-            json.dump(results, file, ensure_ascii=False, indent=4)
-
-        print("JSON file updated")
-
-# Usage example
-scraper = HACGScraper(url='https://www.hacg.mov/wp/page/1?s=%E5%90%88%E9%9B%86&submit=%E6%90%9C%E7%B4%A2', filepath=r"C:\Users\levywang\OneDrive\Code\avhub_v2\data\hacg.json")
-scraper.update_json_file()
-
-
-
diff --git a/utils/spider.py b/utils/spider.py
index 9d0f65e..1f262c4 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -8,10 +8,10 @@ from omegaconf import DictConfig
 from utils.logger import setup_logger
 
 class AVSpider:
-    def __init__(self, av_code, source_url, proxy_url, cfg: DictConfig):
+    def __init__(self, av_code, source_url, proxy_url, use_proxy, cfg: DictConfig):
         self.source_url = source_url
         self.av_code = av_code.lower()
-        self.proxy_url = proxy_url
+        self.proxy_url = proxy_url if use_proxy else None
         self.headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
             'Content-Type': 'application/json'
@@ -19,7 +19,7 @@ class AVSpider:
         self.proxies = {
             "http": self.proxy_url,
             "https": self.proxy_url
-        }
+        } if self.proxy_url else {}
         self.logger = setup_logger(cfg)
 
     def get_video_url(self) -> list:
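
A minimal sketch of how the new switch behaves once the patch is applied. build_proxies is a hypothetical helper written here only to mirror the constructor logic in utils/spider.py; the sample proxy URL is the value from data/config.yaml:

import requests

def build_proxies(proxy_url, use_proxy):
    # Mirrors the AVSpider constructor: the proxy URL is honoured only
    # when the switch is on; otherwise an empty mapping is produced and
    # requests connects directly.
    effective = proxy_url if use_proxy else None
    return {"http": effective, "https": effective} if effective else {}

# use_proxy: false (the new default) -> empty mapping, direct connection
assert build_proxies("http://192.168.50.3:7890", use_proxy=False) == {}

# use_proxy: true -> both schemes are routed through the configured proxy
assert build_proxies("http://192.168.50.3:7890", use_proxy=True) == {
    "http": "http://192.168.50.3:7890",
    "https": "http://192.168.50.3:7890",
}

# Either mapping can be passed straight to requests, e.g.
# requests.get(url, headers=headers, proxies=build_proxies(proxy_url, use_proxy))

Passing an empty dict still lets requests fall back to the HTTP_PROXY/HTTPS_PROXY environment variables, so call sites that already pass self.proxies keep working with the switch off.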