-# AvHub - R18 Resource Search & Management Tool
+ # AvHub - R18 Resource Search & Management Tool
-**AvHub** is a web platform dedicated to the retrieval and management of adult video resources.
+ **AvHub** is a web platform dedicated to the retrieval and management of adult video resources.
Cloudflare Page: https://avhub.pages.dev/
@@ -55,7 +55,7 @@ python main.py
```
The default API address: `https://api.wwlww.org/`
-You can configure a reverse proxy and domain, replacing `BASE_URL` in line 52 of `web/script.js`.
+You can configure a reverse proxy and domain by replacing `BASE_URL` on line 38 of `web/script.js`.
The backend configuration file is located in `data/config.yaml`. Modify it according to your actual needs.
diff --git a/README_CN.md b/README_CN.md
index 7fcb3e8..aa464b5 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -2,9 +2,9 @@
-# AvHub - 成人影视资源管理平台
+# AvHub - R18 资源搜索和管理工具
-**AvHub** 是一款专注成人影视资源检索与管理的Web平台
+**AvHub** 是一个致力于检索和管理成人视频资源的 Web 平台
Cloudflare Page: https://avhub.pages.dev/
@@ -56,7 +56,7 @@ python main.py
```
默认运行的API地址:`http://127.0.0.1:8000/`
-可以配置反代和域名,替换 `web/script.js` 52行中的 `BASE_URL`
+可以配置反代和域名,替换 `web/script.js` 38行中的 `BASE_URL`
后端运行的配置文件在 `data/config.yaml` 中,请根据实际情况修改
diff --git a/data/config.yaml b/data/config.yaml
index 498eeb1..7482166 100644
--- a/data/config.yaml
+++ b/data/config.yaml
@@ -13,9 +13,11 @@ av_spider:
source_url: "https://missav.ai/cn/search/"
proxy_url: "http://192.168.50.3:7890" # http or socks5 proxy
use_proxy: false
+  use_cache: true # whether to enable the lookup cache
+  cache_dir: "/app/data/.av" # cache directory path
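+  # results are cached as one JSON file per code; delete this directory to reset the cache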
hacg_spider:
- source_url: "https://www.hacg.mov/wp/"
+ source_url: "https://www.hacg.mov"
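+  # the spider builds '<source_url>/wp/page/<n>' itself, so no trailing /wp/ is needed here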
logging:
log_file: "main.log"
diff --git a/main.py b/main.py
index 6d11537..b8c868a 100644
--- a/main.py
+++ b/main.py
@@ -14,18 +14,27 @@ import hydra
from utils.logger import setup_logger
import schedule
import time
+from contextlib import asynccontextmanager
+import pathlib
+import re
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
@hydra.main(config_path='data/', config_name='config', version_base=None)
def main(cfg: DictConfig):
# 初始化日志记录器
+ global logger
logger = setup_logger(cfg)
- app = FastAPI()
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Actions run before the app starts serving
+ logger.info("Application startup")
+ yield
+     # Actions run on shutdown
+ logger.info("Application shutdown")
- @app.on_event("startup")
- async def startup_event():
- global logger
- logger = setup_logger(cfg)
+ app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
@@ -35,49 +44,94 @@ def main(cfg: DictConfig):
allow_headers=cfg.app.cors_headers,
)
- def get_image_url(video_url: str) -> str:
+    # Thread pool for blocking HTTP requests and HTML parsing, keeping the event loop responsive
+ executor = ThreadPoolExecutor(max_workers=10)
+
+ def _fetch_url(url: str) -> str:
+        """Fetch the contents of a URL."""
+ try:
+            response = requests.get(url, timeout=10)  # 10-second timeout to avoid long waits
+ response.raise_for_status()
+ return response.text
+ except Exception as e:
+ logger.error(f"Failed to fetch URL {url}: {str(e)}")
+ return ""
+
+ def _parse_html(html_content: str, image_dir_url: str) -> list:
+        """Parse the HTML directory listing and extract image links."""
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+ a_tags = soup.find_all('a', href=True)
+ links = [image_dir_url + tag['href'] for tag in a_tags if tag['href'] != '../']
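+            # Prefer .webp preview frames; fall back to all collected links if none are found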
+ return [link for link in links if link.endswith('.webp')] or links
+ except Exception as e:
+ logger.error(f"Failed to parse HTML: {str(e)}")
+ return []
+
+ async def get_image_url(video_url: str) -> str:
+        """Asynchronously resolve a preview image URL for a video."""
try:
# 构建图片目录URL
image_dir_url = video_url.replace('index.m3u8', 'image/')
- # 发送请求获取目录内容
- response = requests.get(image_dir_url, timeout=20) # 设置超时时间防止长时间等待
- response.raise_for_status() # 如果响应状态码不是200,抛出HTTPError
+        # Fetch the directory listing in the thread pool, capped at 15 seconds
+ loop = asyncio.get_event_loop()
+ html_content = await asyncio.wait_for(
+ loop.run_in_executor(executor, _fetch_url, image_dir_url),
+ timeout=15
+ )
+
+ if not html_content:
+ return None
- # 解析HTML并提取链接
- soup = BeautifulSoup(response.text, 'html.parser')
- a_tags = soup.find_all('a', href=True) # 只查找有href属性的标签
-
- # 分离出.webp和其他格式链接,并排除上级目录链接
- links = [image_dir_url + tag['href'] for tag in a_tags if tag['href'] != '../']
- webp_links = [link for link in links if link.endswith('.webp')]
-
- # 优先返回.webp链接,如果没有则从其他链接中随机返回
+        # Parse the HTML with a 5-second timeout
+ links = await asyncio.wait_for(
+ loop.run_in_executor(executor, _parse_html, html_content, image_dir_url),
+ timeout=5
+ )
+
if not links:
logger.warning("No image links found.")
return None
- return random.choice(webp_links or links)
+
+ return random.choice(links)
+ except asyncio.TimeoutError:
+ logger.error(f"Timeout while processing image URL for {video_url}")
+ return None
except Exception as e:
logger.error(f"Failed to obtain the image URL: {str(e)}")
return None
- def read_random_line(file_path: str) -> tuple[str, str]:
- """Reads a random line from a given file and returns video URL and image URL."""
+ async def read_random_line(file_path: str) -> tuple[str, str]:
+        """Asynchronously pick a random line and resolve its image URL."""
if not os.path.isfile(file_path):
logger.error("File not found")
raise HTTPException(status_code=404, detail="File not found")
- with open(file_path, 'r') as file:
- lines = file.readlines()
+    lines = []
+    try:
+        loop = asyncio.get_event_loop()
+        # Read the file in the thread pool with a 2-second timeout
+        lines = await asyncio.wait_for(
+            loop.run_in_executor(executor, lambda: pathlib.Path(file_path).read_text().splitlines()),
+            timeout=2
+        )
- if not lines:
- logger.error("File is empty")
- raise HTTPException(status_code=400, detail="File is empty")
+ if not lines:
+ logger.error("File is empty")
+ raise HTTPException(status_code=400, detail="File is empty")
- random_line = random.choice(lines).strip()
- img_url = get_image_url(random_line)
+ random_line = random.choice(lines).strip()
+        # Resolve the image URL with an overall 20-second timeout
+ img_url = await asyncio.wait_for(get_image_url(random_line), timeout=20)
- return random_line, img_url
+ return random_line, img_url
+ except asyncio.TimeoutError:
+ logger.error("Timeout while reading random line or fetching image URL")
+        # On timeout, fall back to the video URL (if the file was read) without an image URL
+        return (random.choice(lines).strip(), None) if lines else (None, None)
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error in read_random_line: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
@app.get("/v1/hacg")
async def read_hacg():
@@ -92,36 +146,80 @@ def main(cfg: DictConfig):
@app.get("/v1/avcode/{code_str}")
async def crawl_av(code_str: str):
+    # Normalize code_str: keep only letters and digits
+ code_str = re.sub(r'[^a-zA-Z0-9]', '', code_str).lower()
+
+    # If caching is enabled, make sure the cache directory exists and try to serve from it
+    if cfg.av_spider.use_cache:
+        # Ensure the cache directory exists
+ pathlib.Path(cfg.av_spider.cache_dir).mkdir(parents=True, exist_ok=True)
+
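+        # Each code is cached as <code>.json inside cache_dir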
+ cache_path = os.path.join(cfg.av_spider.cache_dir, f"{code_str}.json")
+ try:
+ if os.path.exists(cache_path):
+ with open(cache_path, 'r', encoding='utf-8') as f:
+ cached_data = json.load(f)
+ logger.info(f"Cache hit for AV code: {code_str}")
+ return {"status": "succeed", "data": cached_data}
+ except Exception as e:
+ logger.error(f"Error reading cache file: {str(e)}")
+
+    # Cache miss (or unreadable cache): fetch from the network
crawler = AVSpider(av_code=code_str,
- source_url=cfg.av_spider.source_url,
- proxy_url=cfg.av_spider.proxy_url,
- use_proxy=cfg.av_spider.use_proxy,
- cfg=cfg)
- video_links = crawler.get_video_url()
- all_magnet_links = []
+ source_url=cfg.av_spider.source_url,
+ proxy_url=cfg.av_spider.proxy_url,
+ use_proxy=cfg.av_spider.use_proxy,
+ cfg=cfg)
+
+ try:
+ magnet_links = await crawler.process_av_code()
+
+ if not magnet_links:
+ logger.error(f"No magnet links found for AV code: {code_str}")
+ raise HTTPException(status_code=404, detail="No magnet links found")
- for link in video_links:
- magnet_links = crawler.get_magnet_links(link)
- all_magnet_links.extend(magnet_links)
+        # Prepare the response payload (each entry is one parsed result row, stringified)
+ magnet_data = [str(item) for item in magnet_links]
- if not all_magnet_links:
- logger.error("No magnet links found for AV code: %s", code_str)
- raise HTTPException(status_code=404, detail="No magnet links found")
+        # If caching is enabled, persist the data portion to the cache file
+ if cfg.av_spider.use_cache:
+ try:
+ with open(cache_path, 'w', encoding='utf-8') as f:
+ json.dump(magnet_data, f, ensure_ascii=False, indent=4)
+ logger.info(f"Cache written for AV code: {code_str}")
+ except Exception as e:
+ logger.error(f"Error writing cache file: {str(e)}")
- logger.info("Magnet links found for AV code: %s", code_str)
- return {"status": "succeed", "data": [str(item) for item in all_magnet_links]}
+ logger.info(f"Magnet links found for AV code: {code_str}")
+ return {"status": "succeed", "data": magnet_data}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Error processing AV code {code_str}: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+ finally:
+        del crawler  # drop the crawler so its __del__ shuts down its thread pool
@app.get("/v1/get_video")
async def get_random_video_url():
"""Returns a random video URL and its corresponding image URL."""
try:
file_path = cfg.files.video_urls_txt_path
- video_url, img_url = read_random_line(file_path)
+        # Cap the whole operation at 25 seconds
+ video_url, img_url = await asyncio.wait_for(
+ read_random_line(file_path),
+ timeout=25
+ )
+
+ if not video_url:
+ raise HTTPException(status_code=500, detail="Failed to get video URL")
+
logger.info("Random video URL and image URL fetched successfully")
return {
"url": video_url,
"img_url": img_url or ""
}
+ except asyncio.TimeoutError:
+ logger.error("Global timeout in get_random_video_url")
+ raise HTTPException(status_code=504, detail="Request timeout")
except Exception as e:
logger.error(f"Failed to fetch random video URL: {str(e)}")
raise HTTPException(status_code=500, detail=str(e))
diff --git a/requirements.txt b/requirements.txt
index 8dbd539..8cbf919 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,10 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.13
+aiosignal==1.3.2
annotated-types==0.7.0
antlr4-python3-runtime==4.9.3
anyio==4.8.0
+attrs==25.3.0
beautifulsoup4==4.13.3
certifi==2025.1.31
cffi==1.17.1
@@ -8,11 +12,14 @@ charset-normalizer==3.4.1
click==8.1.8
curl_cffi==0.9.0
fastapi==0.115.11
+frozenlist==1.5.0
h11==0.14.0
hydra-core==1.3.2
idna==3.10
+multidict==6.1.0
omegaconf==2.3.0
packaging==24.2
+propcache==0.3.0
pycparser==2.22
pydantic==2.10.6
pydantic_core==2.27.2
@@ -24,4 +31,5 @@ soupsieve==2.6
starlette==0.46.1
typing_extensions==4.12.2
urllib3==2.3.0
-uvicorn==0.34.0
\ No newline at end of file
+uvicorn==0.34.0
+yarl==1.18.3
\ No newline at end of file
diff --git a/utils/spider.py b/utils/spider.py
index 1f262c4..bbc96be 100644
--- a/utils/spider.py
+++ b/utils/spider.py
@@ -2,10 +2,15 @@
import re
import json
import os
+import asyncio
+import aiohttp
+from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from curl_cffi import requests
from omegaconf import DictConfig
from utils.logger import setup_logger
+from typing import List, Set, Dict, Any
+from aiohttp import ClientTimeout
class AVSpider:
def __init__(self, av_code, source_url, proxy_url, use_proxy, cfg: DictConfig):
@@ -13,21 +18,87 @@ class AVSpider:
self.av_code = av_code.lower()
self.proxy_url = proxy_url if use_proxy else None
self.headers = {
- 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
- 'Content-Type': 'application/json'
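+            # Browser-like headers to reduce the chance of being blocked by the source site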
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+ 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
+ 'Accept-Encoding': 'gzip, deflate, br',
+ 'Connection': 'keep-alive',
+ 'Upgrade-Insecure-Requests': '1',
+ 'Sec-Fetch-Dest': 'document',
+ 'Sec-Fetch-Mode': 'navigate',
+ 'Sec-Fetch-Site': 'none',
+ 'Sec-Fetch-User': '?1',
}
self.proxies = {
"http": self.proxy_url,
"https": self.proxy_url
} if self.proxy_url else {}
self.logger = setup_logger(cfg)
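+        # Thread pool for running blocking curl_cffi requests and BeautifulSoup parsing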
+ self.executor = ThreadPoolExecutor(max_workers=10)
- def get_video_url(self) -> list:
- """
- 获取视频页面的链接。
-
- :return: 包含视频页面链接的列表。
- """
+ def _fetch_url(self, url: str) -> str:
+        """Fetch URL content with curl_cffi."""
+ try:
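+            # impersonate makes curl_cffi mimic Chrome's TLS fingerprint, which helps avoid anti-bot blocking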
+ response = requests.get(
+ url,
+ proxies=self.proxies,
+ headers=self.headers,
+ impersonate="chrome110",
+ timeout=30
+ )
+ response.raise_for_status()
+ return response.text
+ except Exception as e:
+ self.logger.error(f"Error fetching {url}: {str(e)}")
+ return ""
+
+ def _parse_video_page(self, html_content: str, code_str: str) -> Set[str]:
+        """Parse the video search page (runs in the thread pool)."""
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+ unique_links = set()
+ for a_tag in soup.find_all('a'):
+ alt_text = a_tag.get('alt')
+ if alt_text and code_str in alt_text:
+ href = a_tag.get('href')
+ if href:
+ unique_links.add(href)
+ return unique_links
+ except Exception as e:
+ self.logger.error(f"Error parsing video page: {str(e)}")
+ return set()
+
+ def _parse_magnet_page(self, html_content: str) -> List[List[str]]:
+        """Parse a magnet-link table page (runs in the thread pool)."""
+ try:
+ soup = BeautifulSoup(html_content, 'html.parser')
+ target_table = soup.find('table', class_='min-w-full')
+ result = []
+
+ if target_table is not None:
+ rows = target_table.find_all('tr')
+ for row in rows:
+ cols = row.find_all('td')
+ data = []
+ for col in cols:
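+                        # Keep hrefs and cell text, skipping keepshare.org links and the "下载" label text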
+ links = col.find_all('a', rel='nofollow')
+ if links:
+ for l in links:
+ href = l['href']
+ if "keepshare.org" not in href:
+ data.append(href)
+ text = col.get_text(strip=True)
+ if text != "下载" and "keepshare.org" not in text:
+ data.append(text)
+ if data:
+ result.append(data)
+ return result
+ except Exception as e:
+ self.logger.error(f"Error parsing magnet page: {str(e)}")
+ return []
+
+ async def get_video_url(self) -> List[str]:
+        """Collect links to matching video detail pages."""
code_str = self.av_code.replace('-', '')
match = re.match(r'([a-zA-Z]+)(\d+)', code_str)
if not match:
@@ -37,73 +108,69 @@ class AVSpider:
letters, digits = match.groups()
code_str = f"{letters.lower()}-{digits}"
url = f"{self.source_url}{code_str}"
- try:
- response = requests.get(url, proxies=self.proxies, headers=self.headers)
- response.raise_for_status()
- except requests.RequestException as e:
- self.logger.error(f"Request Error: {e}")
+
+    # Run the synchronous request in the thread pool
+ loop = asyncio.get_event_loop()
+ html_content = await loop.run_in_executor(self.executor, self._fetch_url, url)
+
+ if not html_content:
return []
-
- html_content = response.text
-
- soup = BeautifulSoup(html_content, 'html.parser')
- unique_links = set()
-
- for a_tag in soup.find_all('a'):
- alt_text = a_tag.get('alt')
- if alt_text and code_str in alt_text:
- href = a_tag.get('href')
- if href:
- unique_links.add(href)
-
- self.logger.info(f"Found video URLs: {unique_links}")
-
+
+    # Parse the HTML in the thread pool
+ unique_links = await loop.run_in_executor(
+ self.executor,
+ self._parse_video_page,
+ html_content,
+ code_str
+ )
+
+ self.logger.info(f"Found {len(unique_links)} video URLs")
return list(unique_links)
- def get_magnet_links(self, link: str) -> list:
- """
- 从视频页面中提取磁力链接。
+ async def get_magnet_links(self, links: List[str]) -> List[List[str]]:
+        """Fetch magnet links from all video pages concurrently."""
+ loop = asyncio.get_event_loop()
+ tasks = []
- :param link: 视频页面的 URL。
- :return: 包含磁力链接的列表。
- """
+    # Schedule a fetch task for every page
+ for link in links:
+ task = loop.run_in_executor(self.executor, self._fetch_url, link)
+ tasks.append(task)
+
+    # Wait for all page bodies
+ html_contents = await asyncio.gather(*tasks)
+
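+    # _fetch_url returns "" on failure, and empty pages are skipped below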
+    # Parse every fetched page in the thread pool
+ parse_tasks = [
+ loop.run_in_executor(self.executor, self._parse_magnet_page, content)
+ for content in html_contents if content
+ ]
+ results = await asyncio.gather(*parse_tasks)
+
+    # Merge results from all pages
+ all_results = []
+ for result in results:
+ all_results.extend(result)
+
+ self.logger.info(f"Found {len(all_results)} magnet links")
+ return all_results
+
+ async def process_av_code(self) -> List[List[str]]:
+        """Main entry point: resolve this AV code to its magnet links."""
try:
- response = requests.get(link, proxies=self.proxies, headers=self.headers)
- response.raise_for_status()
- except requests.RequestException as e:
- self.logger.error(f"Request Error: {e}")
+ video_links = await self.get_video_url()
+ if not video_links:
+ return []
+
+ magnet_links = await self.get_magnet_links(video_links)
+ return magnet_links
+ except Exception as e:
+ self.logger.error(f"Error processing AV code {self.av_code}: {str(e)}")
return []
- html_content = response.text
-
- soup = BeautifulSoup(html_content, 'html.parser')
- target_table = soup.find('table', class_='min-w-full')
-
- result = []
- if target_table is not None:
- rows = target_table.find_all('tr')
- for row in rows:
- cols = row.find_all('td')
- data = []
-
- for col in cols:
- links = col.find_all('a', rel='nofollow')
- if links:
- for l in links:
- href = l['href']
- if "keepshare.org" not in href:
- data.append(href)
-
- text = col.get_text(strip=True)
- if text != "下载" and "keepshare.org" not in text:
- data.append(text)
-
- result.append(data)
-
- self.logger.info(f"Magnet links extracted from {link}")
-
- return result
-
+ def __del__(self):
+        """Best-effort shutdown of the thread pool when the spider is garbage-collected."""
+ self.executor.shutdown(wait=False)
class HacgSpider:
def __init__(self, url, filepath, cfg: DictConfig):
@@ -134,7 +201,7 @@ class HacgSpider:
return pages
def get_links(self, page):
- url = f'{self.url}page/{page}?s=%E5%90%88%E9%9B%86&submit=%E6%90%9C%E7%B4%A2'
+ url = f'{self.url}/wp/page/{page}?s=%E5%90%88%E9%9B%86&submit=%E6%90%9C%E7%B4%A2'
try:
response = requests.get(url)
response.raise_for_status()
diff --git a/web/index.html b/web/index.html
index c41a9da..79891bc 100644
--- a/web/index.html
+++ b/web/index.html
@@ -98,11 +98,11 @@
显示封面
-