目标网址:aHR0cHM6Ly93d3cubGVpc2ltYW8uY29tLw==
直接上代码
这个网站在请求过于频繁后会阻止你的请求,所以加了一些防护措施,会有一些延迟;并且没有破解会员限制,似乎只能下载每组图的前 20 张。
仅供学习,请勿用于非法用途!
import os
import re
from urllib.parse import urljoin

import requests
from lxml import etree
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Shared browser User-Agent so the site does not reject us as a bot.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/125.0.0.0 Safari/537.36"
)


def _make_session():
    """Build a requests Session that retries transient 5xx errors.

    The site throttles aggressive clients, so every request goes through a
    session with up to 5 retries and exponential backoff (backoff_factor=1).
    """
    session = requests.Session()
    retry = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def _fetch(session, url, referer):
    """GET *url* with the shared headers and a 10 s timeout.

    Returns the Response on success, or None after printing the error
    (keeps the original best-effort behaviour: failures are reported,
    not raised).
    """
    header = {"User-Agent": USER_AGENT, "Referer": referer}
    try:
        resp = session.get(url, headers=header, timeout=10)
        resp.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"请求出错: {e}")
        return None
    return resp


def get_main_page():
    """Prompt for a listing page number, fetch it, and process every album link."""
    num = int(input("请输入需要爬取蕾丝猫的页码:"))
    url = f"https://www.leisimao.com/page/{num}"
    session = _make_session()
    resp = _fetch(session, url, "https://www.lesmao.com/")
    if resp is None:
        return
    main_page = etree.HTML(resp.text)
    # Each album thumbnail on the listing page links to its detail page.
    album_links = main_page.xpath("//div[@class='photo group']/a/@href")
    for img_url in album_links:
        href = urljoin(url, img_url)  # hrefs may be relative; resolve against the page URL
        get_json_data(url, href)


def get_json_data(url, href):
    """Fetch an album detail page and hand its embedded NUXT JSON to the parser.

    *url* is the listing page (used as Referer), *href* the album page URL.
    """
    session = _make_session()
    resp = _fetch(session, href, url)
    if resp is None:
        return
    page = etree.HTML(resp.text)
    # The image list is embedded as JSON in the Nuxt hydration script tag.
    script_content = page.xpath("//script[@id='__NUXT_DATA__']/text()")
    if script_content:
        parse_json_data(script_content[0])


def parse_json_data(json_data):
    """Extract image paths from the NUXT JSON blob and download each one to img/.

    The JSON contains quoted paths like "T/XiuRen/....jpg"; the CDN actually
    serves them as .webp under https://img.lesmao.vip/1178/.
    """
    header = {"User-Agent": USER_AGENT, "Referer": "https://www.lesmao.com/"}
    pattern = r'"T/XiuRen/.*?\.jpg"'
    jpg_links = re.findall(pattern, json_data)
    # Fix: the original crashed with FileNotFoundError when img/ did not exist.
    os.makedirs("img", exist_ok=True)
    for link in jpg_links:
        link = link.strip('"')  # the regex capture keeps the surrounding quotes
        href = urljoin("https://img.lesmao.vip/1178/", link)
        href = href.replace(".jpg", ".webp")  # CDN serves .webp, not .jpg
        print(f"正在下载图片: {href}")
        # Fix: the original had no timeout (could hang forever), no status
        # check (HTTP error pages were saved as images), and one failure
        # aborted the whole loop. Skip the bad image and keep going.
        try:
            img_resp = requests.get(href, headers=header, timeout=10)
            img_resp.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"请求出错: {e}")
            continue
        file_name = href.split("/")[-1]
        with open(f"img/{file_name}", "wb") as f:
            f.write(img_resp.content)


if __name__ == '__main__':
    get_main_page()
© 版权声明
暂无评论内容