批量采集图片源码
188*****018 发布于2020-01 浏览:2738 回复:0
0
收藏

大家在学习一段时间后,可以尝试自建数据集。本源码可根据关键字和数量,从百度图片网站批量采集图片(也可在源码内更换采集的目标网址)。各项功能已在源码中以注释说明,可帮助大家提高采集图片数据的效率。

import os
import re
import uuid
from urllib.parse import quote, urlparse

import requests


class DownloadImages:
    """Download images for one keyword from the Baidu image search "flip" pages.

    Images are written to ``../images/<all_class>/<key_word>/`` under a random
    UUID-based file name.

    Attributes:
        download_sum: Number of images saved so far in the current run.
        download_max: Maximum number of images to save for this keyword.
        key_word: Search keyword.
        save_path: Directory the images are written to.
    """

    # Captures every original-image URL embedded in the result page.
    _OBJ_URL_RE = re.compile(r'"objURL":"(.*?)",', re.S)
    # Timeout in seconds, applied to both page and image requests.
    _REQUEST_TIMEOUT = 50
    # Extension used when the image URL does not carry a usable one.
    _DEFAULT_EXTENSION = 'jpg'
    # Value sent as the ``gsm`` query parameter; kept as in the original
    # request (its exact meaning is not documented here).
    _GSM = '80'

    def __init__(self, download_max, all_class, key_word):
        """Store the download limit, keyword and target directory.

        Args:
            download_max: Images to download for this keyword (int or numeric
                string; converted with ``int``).
            all_class: Top-level category name, used as the parent directory.
            key_word: Search keyword, also used as the sub-directory name.
        """
        self.download_sum = 0
        self.download_max = int(download_max)
        self.key_word = key_word
        self.save_path = '../images/%s/%s' % (all_class, key_word)

    def _build_search_url(self, pn):
        """Return the search-page URL for result offset ``pn``."""
        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot corrupt the query string.
        return ('http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&'
                'word=' + quote(str(self.key_word)) + '&pn=' + str(pn) +
                '&gsm=' + self._GSM + '&ct=&ic=0&lm=-1&width=0&height=0')

    @staticmethod
    def _guess_extension(img_url):
        """Return a file extension (without the dot) derived from ``img_url``.

        Only the URL path is inspected, so query strings and host names never
        leak into the file name. Falls back to ``'jpg'`` when no short
        alphanumeric extension is present or the URL cannot be parsed.
        """
        try:
            path = urlparse(str(img_url)).path
        except ValueError:
            return DownloadImages._DEFAULT_EXTENSION
        ext = os.path.splitext(path)[1].lstrip('.')
        if ext.isalnum() and len(ext) <= 5:
            return ext
        return DownloadImages._DEFAULT_EXTENSION

    def start_download(self):
        """Fetch result pages until ``download_max`` images have been saved.

        Stops early when a result page contains no image URLs, so a keyword
        with too few results cannot loop forever. Errors raised while fetching
        a result page propagate to the caller.
        """
        self.download_sum = 0
        pn = 0  # offset into the search results, independent of successes
        os.makedirs(self.save_path, exist_ok=True)
        while self.download_sum < self.download_max:
            url = self._build_search_url(pn)
            print(url)
            result = requests.get(url, timeout=self._REQUEST_TIMEOUT)
            found = self.downloadImages(result.text)
            if not found:
                print('【提示】没有找到更多图片,提前结束下载')
                break
            # Advance by what the page offered, not by what was saved, so
            # failed downloads do not cause the same results to be re-fetched.
            pn += found
        print("下载完成")

    def downloadImages(self, html):
        """Download the images referenced by one result page.

        Args:
            html: Text of a search result page.

        Returns:
            The number of image URLs found in ``html`` (0 when there are none),
            regardless of how many were saved successfully.
        """
        img_urls = self._OBJ_URL_RE.findall(html)
        print('找到关键词:' + self.key_word + '的图片,现在开始下载图片...')
        for img_url in img_urls:
            if self.download_sum >= self.download_max:
                break
            print('正在下载第' + str(self.download_sum + 1) + '张图片,图片地址:' + str(img_url))
            try:
                pic = requests.get(img_url, timeout=self._REQUEST_TIMEOUT)
                # Do not store HTTP error pages as if they were images.
                pic.raise_for_status()
                pic_name = (self.save_path + '/' + str(uuid.uuid1()) + '.' +
                            self._guess_extension(img_url))
                with open(pic_name, 'wb') as f:
                    f.write(pic.content)
            except Exception as err:
                # Deliberately broad: one bad image must not abort the batch.
                # Unlike a bare ``except`` this still lets Ctrl-C stop the run.
                print('【错误】当前图片无法下载,%s' % err)
                continue
            self.download_sum += 1
        return len(img_urls)


if __name__ == '__main__':
    # Interactive entry point: ask for the parent category, the keywords and
    # the per-keyword image count, then download each keyword in turn.
    root_category = input('请输入你要下载总类别名称:')
    keyword_count = int(input('请输入你要下载几个类别:'))
    # One prompt per keyword, numbered from 1.
    keywords = [
        input('请输入第%s个关键字:' % str(index + 1))
        for index in range(keyword_count)
    ]
    per_keyword_limit = input('请输入每个类别下载的数量:')
    for keyword in keywords:
        DownloadImages(per_keyword_limit, root_category, keyword).start_download()
收藏
点赞
0
个赞
TOP
切换版块