目标网址:aHR0cHM6Ly93d3cubGVpc2ltYW8uY29tLw==
直接上代码
这个网站在请求过于频繁后会阻止你的请求,所以加了一些防护措施(重试与超时),会有一定延迟;另外没有破解会员限制,似乎只能下载每组前 20 张图片。
仅供学习,请勿用于非法用途!
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
def get_main_page():
    """Prompt for a listing-page number, fetch that listing page, and pass
    every album link found on it to ``get_json_data``.

    Network failures are printed and abort the run instead of raising.
    """
    num = int(input("请输入需要爬取蕾丝猫的页码:"))
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/125.0.0.0 Safari/537.36",
        "Referer": "https://www.lesmao.com/"
    }
    url = f"https://www.leisimao.com/page/{num}"
    # The site throttles frequent requests, so retry transient server
    # errors with exponential backoff.
    retry = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retry)
    # Use the session as a context manager so its connection pool is
    # released even on error (the original never closed it).
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        try:
            resp = session.get(url, headers=header, timeout=10)  # cap the wait time
            resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"请求出错: {e}")
            return
    main_page = etree.HTML(resp.text)
    main_url = main_page.xpath("//div[@class='photo group']/a/@href")
    for img_url in main_url:
        # Album hrefs may be relative; resolve them against the listing URL.
        href = urljoin(url, img_url)
        get_json_data(url, href)
def get_json_data(url, href):
    """Fetch one album page and extract its embedded Nuxt JSON payload.

    Parameters
    ----------
    url : str
        Listing-page URL, sent as the ``Referer`` header.
    href : str
        Absolute URL of the album page to fetch.

    If the ``__NUXT_DATA__`` script tag is found, its text is forwarded to
    ``parse_json_data``; otherwise the page is silently skipped.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/125.0.0.0 Safari/537.36",
        "Referer": url
    }
    # Same throttling defence as get_main_page: retry 5xx responses with
    # exponential backoff.
    retry = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504]
    )
    adapter = HTTPAdapter(max_retries=retry)
    # Context-manage the session so the connection pool is closed.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        try:
            resp = session.get(href, headers=header, timeout=10)  # cap the wait time
            resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"请求出错: {e}")
            return
    # The site is a Nuxt app; the image list lives in the JSON blob inside
    # the <script id="__NUXT_DATA__"> tag.
    page = etree.HTML(resp.text)
    script_content = page.xpath("//script[@id='__NUXT_DATA__']/text()")
    if script_content:
        parse_json_data(script_content[0])
def parse_json_data(json_data):
    """Extract XiuRen image paths from the Nuxt JSON blob, rewrite each to a
    ``.webp`` CDN URL, download it, and save it under ``img/``.

    Parameters
    ----------
    json_data : str
        Raw text of the page's ``__NUXT_DATA__`` script tag.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/125.0.0.0 Safari/537.36",
        "Referer": "https://www.lesmao.com/"
    }
    # Fix: create the output directory up front — the original crashed with
    # FileNotFoundError when img/ did not already exist.
    os.makedirs("img", exist_ok=True)
    # Quoted JPG paths inside the JSON blob, e.g. "T/XiuRen/....jpg".
    pattern = r'"T/XiuRen/.*?\.jpg"'
    jpg_links = re.findall(pattern, json_data)
    for link in jpg_links:
        link = link.strip('"')  # drop the surrounding JSON quotes
        href = urljoin("https://img.lesmao.vip/1178/", link)
        href = href.replace(".jpg", ".webp")  # the CDN serves webp variants
        print(f"正在下载图片: {href}")
        try:
            # Fix: the original GET had no timeout (could hang forever) and no
            # status check (HTTP error pages were written to disk as images).
            img_resp = requests.get(href, headers=header, timeout=10)
            img_resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"请求出错: {e}")
            continue  # skip this image, keep downloading the rest
        file_name = href.split("/")[-1]
        with open(f"img/{file_name}", "wb") as f:
            f.write(img_resp.content)
# Script entry point: only run the scraper when executed directly.
if __name__ == '__main__':
    get_main_page()
© 版权声明









暂无评论内容