|
美女图片比较正经,脚本自动跳过只有一张图片的专题。爬下来的图片是.webp格式的,容量小。我看了一下,大概100页左右,爬完一共600多M
感觉质量不行。权当练手吧。
- import os
- import time
- import requests
- import re
- from concurrent.futures import ThreadPoolExecutor
-
# Sanitize a string so it is safe to use as a Windows directory name.
def normalize_directory_name(name):
    """Return *name* made safe for use as a directory name.

    Characters illegal in Windows paths (\\ / : * ? " < > |) become spaces,
    runs of whitespace collapse to a single space, and the result is
    stripped and capped at 100 characters.
    """
    # Map every illegal path character to a space in one pass.
    illegal_to_space = str.maketrans({ch: ' ' for ch in '\\/:*?"<>|'})
    # split()/join collapses any whitespace run and strips the ends.
    collapsed = ' '.join(name.translate(illegal_to_space).split())
    return collapsed[:100]
def download_image(image_info):
    """Download one tuchong image and return its id together with the raw bytes.

    *image_info* is one entry of a post's ``images`` list; the fields
    ``user_id`` and ``img_id_str`` are read to build the CDN URL.
    Returns a ``(img_id_str, image_bytes)`` tuple.
    Raises ``requests.HTTPError`` on a non-2xx response and
    ``requests.Timeout`` if the server stalls.
    """
    user_id = image_info['user_id']
    img_id_str = image_info['img_id_str']
    img_url = f'https://photo.tuchong.com/{user_id}/f/{img_id_str}.webp'
    # timeout: without it a single hung connection blocks its pool thread forever
    response = requests.get(url=img_url, timeout=30)
    # Fail loudly instead of silently saving an HTML error page as a .webp file.
    response.raise_for_status()
    return img_id_str, response.content
-
# Root folder for all downloaded images.
SAVE_ROOT = 'G:/tuchongspider'

# Walk the tag's pages (the site has roughly 100 of them).
for page in range(1, 99):
    url = f'https://tuchong.com/rest/tags/%E7%BE%8E%E5%A5%B3/posts?page={page}&count=20&order=weekly&before_timestamp='

    response = requests.get(url=url)
    json_data = response.json()

    post_list = json_data['postList']

    # Pair every image with its post's sanitized title BEFORE downloading.
    # The original code downloaded everything first and then searched all
    # posts per image to recover the title — wasting bandwidth on the
    # single-image posts it meant to skip and doing an O(posts) lookup per image.
    jobs = []
    for post in post_list:
        images = post.get('images', [])
        # Skip posts with at most one image (per the script's stated intent).
        if len(images) <= 1:
            continue
        # Empty titles fall back to the author id, matching the old behavior.
        title = post['title'] or post['author_id']
        title = normalize_directory_name(title)
        for image_info in images:
            jobs.append((title, image_info))

    with ThreadPoolExecutor() as executor:
        futures = [(title, executor.submit(download_image, image_info))
                   for title, image_info in jobs]

        for title, future in futures:
            img_id_str, image_data = future.result()

            image_dir = f'{SAVE_ROOT}/{title}'
            # exist_ok avoids the racy "check then create" pattern.
            os.makedirs(image_dir, exist_ok=True)

            image_path = f'{image_dir}/{title}-{img_id_str}.webp'
            with open(image_path, 'wb') as f:
                f.write(image_data)

            print(f'Downloaded {img_id_str}.webp')
复制代码
|
|