采集的站点:

免费代理IP http://ip.yqie.com/ipproxy.htm
66免费代理网 http://www.66ip.cn/
89免费代理 http://www.89ip.cn/
无忧代理 http://www.data5u.com/
云代理 http://www.ip3366.net/
快代理 https://www.kuaidaili.com/free/
极速专享代理 http://www.superfastip.com/
HTTP代理IP https://www.xicidaili.com/wt/
小舒代理 http://www.xsdaili.com
西拉免费代理IP http://www.xiladaili.com/
小幻HTTP代理 https://ip.ihuan.me/
全网代理IP http://www.goubanjia.com/
飞龙代理IP http://www.feilongip.com/

采集流程

第一步:获取页面内容

第二步:解析内容获取数据

第三步:数据格式转换

采集流程确定之后,把它封装为抽象类,让所有站点类去继承它,子类只需要实现其中的抽象方法即可。这是一个比较典型的模板方法模式。

抽象基类(base.py)的实现如下:

from abc import ABC, abstractmethod
from typing import List
import requests
import bs4
from .model import ProxyModel


class AbsFreeProxyBase(ABC):
    """Template-method base class for free-proxy site scrapers.

    ``run`` fixes the scraping pipeline: fetch the page, parse it, then
    convert the parsed rows into proxy models. Subclasses implement the two
    site-specific steps, ``parse_text`` and ``to_proxy``.
    """

    # HTTP client used to fetch pages. It is looked up through ``self`` so a
    # subclass or an instance can substitute another object with a
    # compatible ``get`` method.
    http = requests

    # Timeout (seconds) applied when the caller did not pass ``timeout`` in
    # ``kwargs``; without one, a stalled server would block the scraper
    # indefinitely.
    default_timeout = 10

    def __init__(self, url, code, **kwargs):
        """
        :param url: address of the page to request
        :param code: character encoding used to decode the page body
        :param kwargs: extra keyword arguments forwarded to ``http.get``
        """
        self.url = url
        self.code = code
        self.kwargs = kwargs
        self.beautifulsoup = bs4.BeautifulSoup

    def run(self) -> List[ProxyModel]:
        """Run the pipeline: step 1 fetch the page, step 2 parse it,
        step 3 convert the parsed rows into proxy models."""
        page_text = self.get_page_text()
        soup = self.beautifulsoup(page_text, 'lxml')
        rows = self.parse_text(soup)
        return self.to_proxy(rows)

    def get_page_text(self):
        """Fetch ``self.url`` and return the body decoded with ``self.code``.

        Raises whatever ``raise_for_status`` raises when the response is an
        HTTP error.
        """
        # Caller-supplied options win over the default timeout.
        options = {'timeout': self.default_timeout, **self.kwargs}
        response = self.http.get(self.url, **options)
        # No-op for successful responses, so no separate ``ok`` check needed.
        response.raise_for_status()
        return response.content.decode(self.code)

    @abstractmethod
    def parse_text(self, soup: bs4.BeautifulSoup) -> List[list]:
        """Extract the raw rows from the parsed page."""
        pass

    @abstractmethod
    def to_proxy(self, data: List[list]) -> List[ProxyModel]:
        """Convert the rows returned by ``parse_text`` into proxy models."""
        pass

例如,快代理网站的采集类实现如下:

from .base import AbsFreeProxyBase
from typing import List
from .model import ProxyModel
import re

'''
快代理
https://www.kuaidaili.com/free
'''


class WWW_KUAIDAILI_COM(AbsFreeProxyBase):
    """Scraper for the Kuaidaili free proxy list
    (https://www.kuaidaili.com/free)."""

    # Matches a <td> cell whose content is plain text and captures that
    # text. Compiled once here instead of on every ``parse_text`` call.
    _CELL_PATTERN = re.compile(r'<td[^>]*>([^<>]+)</td>')

    # ``to_proxy`` reads columns 0-3, so shorter rows cannot be converted.
    _MIN_COLUMNS = 4

    def __init__(self, url, code='utf-8', **kwargs):
        """Same as the base initializer, with ``code`` defaulting to UTF-8."""
        super().__init__(url, code, **kwargs)

    def parse_text(self, soup) -> List[list]:
        """Return the text cells of every row under ``.table-bordered``.

        Expected column layout of each row:
        IP, port, anonymity, type (HTTP/HTTPS), location, response speed,
        last verified time.

        Rows that yield no matching ``<td>`` cells are dropped.
        """
        result = []
        for row in soup.select('.table-bordered tr'):
            cells = self._CELL_PATTERN.findall(str(row))
            if cells:
                result.append(cells)
        return result

    def to_proxy(self, data: List[list]) -> List[ProxyModel]:
        """Build one ``ProxyModel`` per row from columns 3, 0, 1 and 2.

        Going by the column layout above, that is (type, IP, port,
        anonymity). NOTE(review): ProxyModel's parameter names are not
        visible here; confirm the order against its definition.

        Rows with fewer than four cells are skipped so that a single
        malformed row does not abort the whole conversion with IndexError.
        """
        return [
            ProxyModel(item[3], item[0], item[1], item[2])
            for item in data
            if len(item) >= self._MIN_COLUMNS
        ]

应用实例,调用示例代码如下:

from website import ProxyFactory
from browser.agent import useragent

# Usage example: create a scraper for one site through the factory, run it,
# and print the collected proxies.
factory = ProxyFactory()
# Request headers; the User-Agent is whatever useragent.random() returns.
headers = {
    'user-agent': useragent.random()
}

# Alternative sites. To use one, replace the active factory.create call
# below with the matching call.
#
# 66 Free Proxy (page is GBK-encoded):
# www = factory.create('http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=',
#                      'gbk',
#                      headers=headers)
#
# Xiaohuan HTTP Proxy:
# www = factory.create('https://ip.ihuan.me/',headers = headers)
#
# 89 Free Proxy, http://www.89ip.cn/:
# www = factory.create('http://www.89ip.cn/',headers = headers)
#
# Data5u proxy, http://www.data5u.com/:
# www = factory.create('http://www.data5u.com/',headers = headers)
#
# Goubanjia proxy, http://www.goubanjia.com/:
# www = factory.create('http://www.goubanjia.com/',headers = headers)
#
# ip3366 cloud proxy, http://www.ip3366.net/ (page is GBK-encoded):
# www = factory.create('http://www.ip3366.net/','gbk',headers = headers)

# Kuaidaili, https://www.kuaidaili.com/free
www = factory.create('https://www.kuaidaili.com/free',headers = headers)
data = www.run()

print(data)