# Python image-crawler class (cleaned-up version)

from urllib import request
from urllib.parse import urlparse
import re
import os
import time

class fetchPics(request.FancyURLopener):
    """Crawl a listing page, follow its article links and download the images.

    The crawler is driven by a ``config`` dict; any key that is omitted falls
    back to its default:

    * ``url``      -- listing page to start from.
    * ``p_title``  -- regex whose first match on an article page is used as
      the name of the sub-directory for that article.
    * ``p_link``   -- regex matching the (host-relative) article links on the
      listing page.
    * ``p_image``  -- regex matching image tags; when it yields tuples the
      image URL is expected in the second group.
    * ``save_dir`` -- directory that receives the link list and the images.
    * ``output``   -- when false, progress messages are not printed.
    * ``encoding`` -- character set used to decode fetched pages.

    NOTE: ``urllib.request.FancyURLopener`` is a deprecated legacy interface;
    it is kept as the base class so that ``open``/``retrieve`` stay available
    to existing callers.
    """

    # URLopener.__init__ builds the User-Agent header from the class attribute
    # ``version``, so it has to be defined here (a local variable inside
    # __init__ has no effect).
    version = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36')

    # Compiled once for the whole class instead of on every isUrl() call.
    _URL_RE = re.compile(
        r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    def __init__(self, config=None):
        """Initialise the opener and merge ``config`` over the defaults.

        Args:
            config: optional dict of settings (see the class docstring). It is
                never modified.
        """
        # Plain super() so that FancyURLopener.__init__ runs as well; it sets
        # up the state (tries / maxtries / auth_cache) used by the redirect
        # and authentication handlers.
        super().__init__()

        settings = {
            'url': 'http://www.868aa.com/Se/Se-3-1.html',
            'p_title': '<title>(.+?)_',
            'p_link': r'\/Se\/\d+\.html',
            'p_image': r'(td|br)\>\<img\ssrc="(.+?)"',
            'save_dir': 'img',
            'output': True,
            'encoding': 'gb18030',
        }
        # User values override the defaults (not the other way round), and the
        # caller's dict is left untouched.
        if config:
            settings.update(config)

        self.__url = settings['url']
        self.__p_title = settings['p_title']
        self.__p_link = settings['p_link']
        self.__p_image = settings['p_image']
        self.__save_dir = settings['save_dir']
        self.__output = settings['output']
        self.__encoding = settings['encoding']
        # Date stamp used as the file name of the saved link list.
        self.__tday = time.strftime(r"%m_%d_%Y")

    def _log(self, message):
        """Print ``message`` unless output was disabled in the config."""
        if self.__output:
            print(message)

    def _fetch(self, url):
        """Download ``url`` and return its body decoded with the configured encoding."""
        page = self.open(url)
        try:
            return page.read().decode(self.__encoding)
        finally:
            page.close()

    def isUrl(self):
        """Return True when the configured start URL is syntactically valid."""
        return bool(self._URL_RE.match(self.__url))

    def check_dir(self):
        """Create the save directory if it does not exist yet."""
        if not os.path.exists(self.__save_dir):
            os.makedirs(self.__save_dir)
            self._log('Created directory ' + self.__save_dir)

    def get_page(self):
        """Fetch the listing page.

        Returns:
            The decoded page content, or ``False`` when the configured URL is
            not valid.
        """
        if not self.isUrl():
            self._log('Invalid url!')
            return False
        return self._fetch(self.__url)

    def get_links(self, save=True):
        """Collect the article links found on the listing page.

        Args:
            save: when true, the links are also written one per line to a file
                named after today's date inside the save directory.

        Returns:
            List of absolute article URLs; empty when the listing page could
            not be fetched because the URL is invalid.
        """
        content = self.get_page()
        if not content:
            return []

        # The matched links are host-relative, so prefix them with the scheme
        # and host of the configured start URL.
        parts = urlparse(self.__url)
        base = parts.scheme + '://' + parts.netloc
        links = [base + path for path in re.findall(self.__p_link, content)]

        if save:
            self.check_dir()
            list_path = os.path.join(self.__save_dir, self.__tday)
            with open(list_path, 'w', encoding='utf-8') as handle:
                handle.writelines(link + '\n' for link in links)
            self._log('Links saved to ' + list_path)

        return links

    def get_imgs(self):
        """Download every image of every article linked from the listing page.

        Images are stored as ``<save_dir>/<title>/<n><ext>`` where ``n`` is the
        1-based position of the image on its page. Files that already exist
        are not downloaded again. Articles without a title match are skipped.
        """
        self._log('Start to download images...')

        title_re = re.compile(self.__p_title)
        image_re = re.compile(self.__p_image)

        for link in self.get_links():
            content = self._fetch(link)

            titles = title_re.findall(content)
            if not titles:
                # Without a title there is no directory name to save under.
                self._log('No title found in ' + link + ', skipped')
                continue
            title = titles[0]
            images = image_re.findall(content)

            self._log('Start to download ' + title + '...')

            target_dir = os.path.join(self.__save_dir, title)
            if images and not os.path.exists(target_dir):
                os.makedirs(target_dir)
                self._log('Created directory ' + target_dir)

            for index, match in enumerate(images, 1):
                # A multi-group pattern yields tuples (URL in the second
                # group); a pattern with at most one group yields plain strings.
                src = match if isinstance(match, str) else match[1]
                extension = os.path.splitext(src)[1]
                save_name = os.path.join(target_dir, str(index) + extension)

                if not os.path.exists(save_name):
                    self.retrieve(src, save_name)
                    self._log(save_name + ' downloaded')

            self._log(title + ' downloaded')