From 9fc426121a45cdfef96fade54daec4c11e2c3e5e Mon Sep 17 00:00:00 2001
From: levywang
Date: Tue, 11 Mar 2025 15:23:08 +0800
Subject: [PATCH] feat(proxy): add proxy switch

---
 data/config.yaml     |  1 +
 main.py              |  1 +
 utils/hacg_spider.py | 93 --------------------------------------------
 utils/spider.py      |  6 +--
 4 files changed, 5 insertions(+), 96 deletions(-)
 delete mode 100644 utils/hacg_spider.py

diff --git a/data/config.yaml b/data/config.yaml
index 0f8639a..36f9826 100644
--- a/data/config.yaml
+++ b/data/config.yaml
@@ -12,6 +12,7 @@ files:
 av_spider:
   source_url: "https://missav.ai/cn/search/"
   proxy_url: "http://192.168.50.3:7890" # http or socks5 proxy
+  use_proxy: false
 
 hacg_spider:
   source_url: "https://www.hacg.mov/wp/"
diff --git a/main.py b/main.py
index 3846036..b1b0446 100644
--- a/main.py
+++ b/main.py
@@ -95,6 +95,7 @@ def main(cfg: DictConfig):
     crawler = AVSpider(av_code=code_str,
                        source_url=cfg.av_spider.source_url,
                        proxy_url=cfg.av_spider.proxy_url,
+                       use_proxy=cfg.av_spider.use_proxy,
                        cfg=cfg)
     video_links = crawler.get_video_url()
     all_magnet_links = []
diff --git a/utils/hacg_spider.py b/utils/hacg_spider.py
deleted file mode 100644
index 4ea3be3..0000000
--- a/utils/hacg_spider.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import requests
-from bs4 import BeautifulSoup
-import re
-import json
-import os
-
-class HACGScraper:
-    def __init__(self, url, filepath):
-        self.url = url
-        self.filepath = filepath
-
-    def get_pages(self):
-        response = requests.get(self.url)
-        html_content = response.text
-
-        soup = BeautifulSoup(html_content, 'html.parser')
-        div_ele = soup.find('div', class_='wp-pagenavi')
-        page_text = div_ele.get_text() if div_ele else ''
-
-        pages = None
-        if "共" in page_text:
-            pages = int(page_text.split('共')[1].split('页')[0])
-
-        return pages
-
-    def get_links(self, page):
-        url = f'{self.url}'
-        response = requests.get(url)
-        html_content = response.text
-
-        soup = BeautifulSoup(html_content, 'html.parser')
-        links = {}
-        for a_tag in soup.find_all('a'):
-            href = a_tag.get('href')
-            text = a_tag.get_text(strip=True)
-            if "月合集" in text:
-                links[text] = href
-
-        magnet_links = {}
-        for title, link in links.items():
-            response = requests.get(link)
-
-            if response.status_code == 200:
-                content = response.text
-                matches = re.findall(r'\b[a-f0-9]{40}\b', content)
-                if matches:
-                    magnet_links[title] = f'magnet:?xt=urn:btih:{matches[0]}'
-            else:
-                print(f"Request failed, status code: {response.status_code}")
-
-        return magnet_links
-
-    def update_json_file(self):
-        if not os.path.exists(self.filepath) or os.path.getsize(self.filepath) == 0:
-            results = {}
-            total_pages = self.get_pages()
-            for i in range(1, total_pages + 1):
-                new_data = self.get_links(i)
-                results.update(new_data)
-                print(f'Page {i} processed (Full Update)')
-        else:
-            with open(self.filepath, 'r', encoding='utf-8') as file:
-                results = json.load(file)
-
-            total_pages = self.get_pages()
-            for i in range(1, total_pages + 1):
-                new_data = self.get_links(i)
-                all_exists = True
-
-                for title, magnet_link in new_data.items():
-                    if title not in results or results[title] != magnet_link:
-                        all_exists = False
-                        break
-
-                if not all_exists:
-                    results = {**new_data, **results}
-                    print(f'Page {i} processed (Incremental Update)')
-
-                if all_exists:
-                    print(f"Page {i} data already exists in the JSON file, stopping update")
-                    break
-
-        with open(self.filepath, 'w', encoding='utf-8') as file:
-            json.dump(results, file, ensure_ascii=False, indent=4)
-
-        print("JSON file updated")
-
-# Usage example
-scraper = HACGScraper(url='https://www.hacg.mov/wp/page/1?s=%E5%90%88%E9%9B%86&submit=%E6%90%9C%E7%B4%A2', filepath=r"C:\Users\levywang\OneDrive\Code\avhub_v2\data\hacg.json")
-scraper.update_json_file()
-
-
-
diff --git a/utils/spider.py b/utils/spider.py
index 9d0f65e..1f262c4 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -8,10 +8,10 @@ from omegaconf import DictConfig
 from utils.logger import setup_logger
 
 class AVSpider:
-    def __init__(self, av_code, source_url, proxy_url, cfg: DictConfig):
+    def __init__(self, av_code, source_url, proxy_url, use_proxy, cfg: DictConfig):
         self.source_url = source_url
         self.av_code = av_code.lower()
-        self.proxy_url = proxy_url
+        self.proxy_url = proxy_url if use_proxy else None
         self.headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
             'Content-Type': 'application/json'
@@ -19,7 +19,7 @@ class AVSpider:
         self.proxies = {
             "http": self.proxy_url,
             "https": self.proxy_url
-        }
+        } if self.proxy_url else {}
         self.logger = setup_logger(cfg)
 
     def get_video_url(self) -> list:
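
A minimal sketch of how the new switch behaves once the patch is applied. build_proxies is a hypothetical helper written here only to mirror the constructor logic in utils/spider.py; the sample proxy URL is the value from data/config.yaml:

import requests

def build_proxies(proxy_url, use_proxy):
    # Mirrors the AVSpider constructor: the proxy URL is honoured only
    # when the switch is on; otherwise an empty mapping is produced and
    # requests connects directly.
    effective = proxy_url if use_proxy else None
    return {"http": effective, "https": effective} if effective else {}

# use_proxy: false (the new default) -> empty mapping, direct connection
assert build_proxies("http://192.168.50.3:7890", use_proxy=False) == {}

# use_proxy: true -> both schemes are routed through the configured proxy
assert build_proxies("http://192.168.50.3:7890", use_proxy=True) == {
    "http": "http://192.168.50.3:7890",
    "https": "http://192.168.50.3:7890",
}

# Either mapping can be passed straight to requests, e.g.
# requests.get(url, headers=headers, proxies=build_proxies(proxy_url, use_proxy))

Passing an empty dict still lets requests fall back to the HTTP_PROXY/HTTPS_PROXY environment variables, so call sites that already pass self.proxies keep working with the switch off.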