Justin's Words

Python 爬图类整理版 (Python image-scraper class, tidied edition)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from urllib import request
from urllib.parse import urlparse
import re
import os
import time

class fetchPics(request.FancyURLopener):
    """Scrape a gallery index page and download every image set it links to.

    The configured index page is fetched, per-set page links are extracted
    with ``p_link``, and each set's images (matched by ``p_image``) are
    saved under ``save_dir/<set title>/``.

    NOTE(review): ``request.FancyURLopener`` has long been deprecated;
    migrating to ``request.urlopen`` / ``request.urlretrieve`` is advised.
    """

    def __init__(self, config=None):
        """Build the scraper.

        config -- optional dict overriding any of the default settings
                  (url, p_title, p_link, p_image, save_dir, output).
        """
        # FIX: the original called super(request.FancyURLopener, self),
        # which skips FancyURLopener's own __init__ entirely.
        super().__init__()

        defaults = {
            'url': 'http://www.868aa.com/Se/Se-3-1.html',
            'p_title': r'<title>(.+?)_',
            'p_link': r'\/Se\/\d+\.html',
            'p_image': r'(td|br)\>\<img\ssrc=\"(.+?)\"',
            'save_dir': 'img',
            'output': True,
        }
        # FIX: no mutable default argument, and the merge direction is
        # corrected -- the original config.update(default) clobbered every
        # caller-supplied setting with the defaults.
        settings = dict(defaults)
        settings.update(config or {})

        self.__url = settings['url']
        self.__p_title = settings['p_title']
        self.__p_link = settings['p_link']
        self.__p_image = settings['p_image']
        self.__save_dir = settings['save_dir']
        self.__output = settings['output']
        self.__tday = time.strftime("%m_%d_%Y")
        # FIX: assign the User-Agent to the instance; the original bound a
        # local variable `version` that was silently discarded.
        self.version = ('Mozilla/5.0 (X11; Linux x86_64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/38.0.2125.111 Safari/537.36')

    def isUrl(self):
        """Return True when the configured url looks like an http(s)/ftp(s) URL."""
        regex = re.compile(
            r'^(?:http|ftp)s?://'                                  # scheme
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+'
            r'(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'                 # domain...
            r'localhost|'                                          # localhost...
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'                 # ...or IPv4
            r'(?::\d+)?'                                           # optional port
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        # FIX: collapse the verbose if/else returning True/False.
        return bool(regex.match(self.__url))

    def check_dir(self):
        """Create the save directory if it does not exist yet."""
        if not os.path.exists(self.__save_dir):
            os.makedirs(self.__save_dir)
            print('Created directory ' + self.__save_dir)

    def get_page(self):
        """Fetch the index page; return its decoded text, or False when the url is invalid."""
        if not self.isUrl():
            print('Invalid url!')
            return False
        page = self.open(self.__url)
        # The target site serves Chinese pages; gb18030 is a superset of GBK.
        return page.read().decode('gb18030')

    def get_links(self, save=True):
        """Extract all per-set links from the index page.

        The absolute links are written to save_dir/<date> and returned as a
        list.  (The ``save`` flag is kept for interface compatibility.)
        """
        self.check_dir()
        matches = re.compile(self.__p_link).findall(self.get_page())
        links = ['http://www.868aa.com' + m for m in matches]

        # FIX: honor save_dir instead of the hard-coded 'img/', and close
        # the file deterministically via a context manager.
        out_path = os.path.join(self.__save_dir, self.__tday)
        with open(out_path, 'w') as fh:
            fh.writelines(link + '\n' for link in links)
        print('Links saved to ' + out_path)
        return links

    def get_imgs(self):
        """Download every image of every linked set into save_dir/<title>/."""
        self.check_dir()
        print('Start to download images...')

        # Hoist the loop-invariant regex compilation out of the loop.
        p_title = re.compile(self.__p_title)
        p_images = re.compile(self.__p_image)

        for link in self.get_links():
            content = self.open(link).read().decode('gb18030')
            titles = p_title.findall(content)
            images = p_images.findall(content)

            # FIX: skip pages whose title cannot be parsed instead of
            # crashing with IndexError on title[0].
            if not titles:
                print('Skipped ' + link + ': no title found')
                continue
            title = titles[0]

            print('Start to download ' + title + '...')

            # FIX: honor save_dir instead of the hard-coded 'img/', and
            # hoist the directory creation out of the per-image loop.
            set_dir = os.path.join(self.__save_dir, title)
            if not os.path.exists(set_dir):
                os.makedirs(set_dir)
                print('Created directory ' + set_dir)

            # p_image yields (tag, src) tuples; img[1] is the image URL.
            for i, img in enumerate(images, 1):
                ext = os.path.splitext(img[1])[1]
                save_name = os.path.join(set_dir, str(i) + ext)
                if not os.path.exists(save_name):
                    self.retrieve(img[1], save_name)
                    print(save_name + ' downloaded')

            # FIX: the original message lacked the space ('...downloaded').
            print(title + ' downloaded')