"""Image-crawler class, tidied-up version (blog post of 2014-11-17, category "Python").

Given an index page, ``fetchPics`` collects the article links found on it and
downloads every image referenced by each linked page into one folder per page
title.
"""
import os
import re
import shutil
import time
from urllib import request
from urllib.parse import urljoin
from urllib.parse import urlparse  # imported by the original listing; kept for compatibility


class _FallbackOpener:
    """Minimal stand-in providing the ``open``/``retrieve`` calls used below.

    Only used when ``urllib.request.FancyURLopener`` is unavailable (the class
    was removed from the standard library in Python 3.14).
    """

    version = 'Python-urllib'

    def __init__(self, *args, **kwargs):
        self._opener = request.build_opener()
        # Same headers the legacy opener sent, so servers see the same client.
        self._opener.addheaders = [('User-Agent', self.version), ('Accept', '*/*')]

    def open(self, fullurl, data=None):
        """Open *fullurl* and return the file-like response object."""
        return self._opener.open(fullurl, data)

    def retrieve(self, url, filename):
        """Download *url* into *filename*; return ``(filename, headers)``."""
        with self._opener.open(url) as response, open(filename, 'wb') as target:
            shutil.copyfileobj(response, target)
        return filename, response.headers


# Prefer the legacy opener when the running interpreter still ships it, so the
# class keeps its original base (and inherited API) on older Python versions.
_OpenerBase = getattr(request, 'FancyURLopener', _FallbackOpener)


class fetchPics(_OpenerBase):
    """Download the images of every page linked from an index page.

    Args:
        config: Optional mapping overriding any of the default settings:

            * ``url``      -- index page to scan for links.
            * ``p_title``  -- regex whose first match on a linked page is used
              as the folder name for that page's images.
            * ``p_link``   -- regex matching the links on the index page.
            * ``p_image``  -- regex matching images on a linked page; when it
              has several groups the last one must capture the image URL.
            * ``save_dir`` -- directory that receives the link list and images.
            * ``output``   -- when false, progress messages are not printed.
            * ``encoding`` -- character encoding used to decode fetched pages.
    """

    # Sent as the User-Agent header; the opener base class reads this
    # attribute when it is constructed.
    version = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36'

    # Compiled once for the class instead of on every isUrl() call.
    _URL_RE = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    def __init__(self, config=None):
        # Zero-argument super() runs the *direct* base initializer; the
        # original skipped it, leaving the opener's redirect/auth state unset.
        super().__init__()
        settings = {
            'url': 'http://www.868aa.com/Se/Se-3-1.html',
            'p_title': '<title>(.+?)_',
            'p_link': r'\/Se\/\d+\.html',
            'p_image': r'(td|br)\>\<img\ssrc="(.+?)"',
            'save_dir': 'img',
            'output': True,
            'encoding': 'gb18030',
        }
        # Caller values win over the defaults (the original merged the other
        # way round); the caller's mapping itself is never modified.
        if config:
            settings.update(config)
        self.__url = settings['url']
        self.__p_title = settings['p_title']
        self.__p_link = settings['p_link']
        self.__p_image = settings['p_image']
        self.__save_dir = settings['save_dir']
        self.__output = settings['output']
        self.__encoding = settings['encoding']
        # Date stamp used as the file name of the saved link list.
        self.__tday = time.strftime(r"%m_%d_%Y")

    def _log(self, message):
        """Print *message* unless the ``output`` setting is disabled."""
        if self.__output:
            print(message)

    def _read(self, url):
        """Fetch *url* and return its body decoded with the configured encoding."""
        page = self.open(url)
        try:
            return page.read().decode(self.__encoding)
        finally:
            page.close()

    def isUrl(self):
        """Return True when the configured ``url`` looks like an http(s)/ftp(s) URL."""
        return isinstance(self.__url, str) and bool(self._URL_RE.match(self.__url))

    def check_dir(self):
        """Create the save directory if it does not exist yet."""
        if not os.path.exists(self.__save_dir):
            os.makedirs(self.__save_dir, exist_ok=True)
            self._log('Created directory ' + self.__save_dir)

    def get_page(self):
        """Return the decoded index page, or ``False`` when the URL is invalid."""
        if not self.isUrl():
            self._log('Invalid url!')
            return False
        return self._read(self.__url)

    def get_links(self, save=True):
        """Return the absolute links found on the index page.

        Args:
            save: When true (default) the links are also written, one per
                line, to ``<save_dir>/<MM_DD_YYYY>``.

        Returns:
            A list of absolute URLs; empty when the index URL is invalid.
        """
        page = self.get_page()
        if page is False:  # get_page() signals an invalid URL with False
            return []
        # Resolve each match against the index URL instead of a fixed host.
        links = [urljoin(self.__url, link)
                 for link in re.findall(self.__p_link, page)]
        if save:
            self.check_dir()
            list_path = os.path.join(self.__save_dir, self.__tday)
            with open(list_path, 'w', encoding='utf-8') as handle:
                handle.writelines(link + '\n' for link in links)
            self._log('Links saved to ' + list_path)
        return links

    def get_imgs(self):
        """Download the images of every linked page into ``<save_dir>/<title>/``.

        Images are numbered from 1 in page order; files that already exist
        are not downloaded again. Pages without a title match are skipped.
        """
        self._log('Start to download images...')
        title_re = re.compile(self.__p_title)
        image_re = re.compile(self.__p_image)
        for link in self.get_links():
            content = self._read(link)
            titles = title_re.findall(content)
            if not titles:
                self._log('No title found in ' + link + ', skipped')
                continue
            title = titles[0]
            # NOTE(review): the title comes from the remote page and is used
            # as a directory name unchanged -- consider sanitizing it.
            target_dir = os.path.join(self.__save_dir, title)
            images = image_re.findall(content)
            self._log('Start to download ' + title + '...')
            if images and not os.path.exists(target_dir):
                os.makedirs(target_dir, exist_ok=True)
                self._log('Created directory ' + target_dir)
            for index, match in enumerate(images, start=1):
                # findall yields tuples for multi-group patterns and plain
                # strings for single-group ones; the URL is the last group.
                src = match[-1] if isinstance(match, tuple) else match
                extension = os.path.splitext(src)[1]
                save_name = os.path.join(target_dir, str(index) + extension)
                if not os.path.exists(save_name):
                    self.retrieve(urljoin(link, src), save_name)
                    self._log(save_name + ' downloaded')
            self._log(title + 'downloaded')