|
美女图片比较正经,脚本自动跳过只有一张图片的专题。爬下来的图片是.webp格式的,容量小。我看了一下,大概100页左右,爬完一共600多M
感觉质量不行。权当练手吧。
- import os
- import time
- import requests
- import re
- from concurrent.futures import ThreadPoolExecutor
-
# Sanitize a string so it is safe to use as a Windows directory name.
def normalize_directory_name(name):
    """Return *name* made safe for use as a directory name.

    Characters illegal in Windows paths (\\ / : * ? " < > |) become spaces,
    runs of whitespace collapse to a single space, and the result is
    stripped and capped at 100 characters.
    """
    # Map every illegal path character to a space in one pass.
    illegal_to_space = str.maketrans({ch: ' ' for ch in '\\/:*?"<>|'})
    # split()/join collapses any whitespace run and strips the ends.
    collapsed = ' '.join(name.translate(illegal_to_space).split())
    return collapsed[:100]
def download_image(image_info):
    """Download one tuchong image and return its id together with the raw bytes.

    *image_info* is one entry of a post's ``images`` list; the fields
    ``user_id`` and ``img_id_str`` are read to build the CDN URL.
    Returns a ``(img_id_str, image_bytes)`` tuple.
    Raises ``requests.HTTPError`` on a non-2xx response and
    ``requests.Timeout`` if the server stalls.
    """
    user_id = image_info['user_id']
    img_id_str = image_info['img_id_str']
    img_url = f'https://photo.tuchong.com/{user_id}/f/{img_id_str}.webp'
    # timeout: without it a single hung connection blocks its pool thread forever
    response = requests.get(url=img_url, timeout=30)
    # Fail loudly instead of silently saving an HTML error page as a .webp file.
    response.raise_for_status()
    return img_id_str, response.content
-
# Root folder for all downloaded images.
SAVE_ROOT = 'G:/tuchongspider'

# Walk the tag's pages (the site has roughly 100 of them).
for page in range(1, 99):
    url = f'https://tuchong.com/rest/tags/%E7%BE%8E%E5%A5%B3/posts?page={page}&count=20&order=weekly&before_timestamp='

    response = requests.get(url=url)
    json_data = response.json()

    post_list = json_data['postList']

    # Pair every image with its post's sanitized title BEFORE downloading.
    # The original code downloaded everything first and then searched all
    # posts per image to recover the title — wasting bandwidth on the
    # single-image posts it meant to skip and doing an O(posts) lookup per image.
    jobs = []
    for post in post_list:
        images = post.get('images', [])
        # Skip posts with at most one image (per the script's stated intent).
        if len(images) <= 1:
            continue
        # Empty titles fall back to the author id, matching the old behavior.
        title = post['title'] or post['author_id']
        title = normalize_directory_name(title)
        for image_info in images:
            jobs.append((title, image_info))

    with ThreadPoolExecutor() as executor:
        futures = [(title, executor.submit(download_image, image_info))
                   for title, image_info in jobs]

        for title, future in futures:
            img_id_str, image_data = future.result()

            image_dir = f'{SAVE_ROOT}/{title}'
            # exist_ok avoids the racy "check then create" pattern.
            os.makedirs(image_dir, exist_ok=True)

            image_path = f'{image_dir}/{title}-{img_id_str}.webp'
            with open(image_path, 'wb') as f:
                f.write(image_data)

            print(f'Downloaded {img_id_str}.webp')
复制代码
|
|