爬取代理ip
2018/2/4 20:00:59
以“国内高匿代理 IP”作为我们的代理 IP 来源，用以下代码获取页面上的代理 IP。
#-*-coding:utf-8 -*-
import urllib2
import urllib
import re
# Matches a <td> holding a dotted-quad IPv4 address that is immediately
# followed by a <td> holding only digits (the port).  Capturing both cells in
# one match keeps every address paired with its own port; the dots are escaped
# so they match a literal "." rather than any character.
_PROXY_ROW_RE = re.compile(
    r'<td>\s*(\d{1,3}(?:\.\d{1,3}){3})\s*</td>\s*<td>\s*(\d+)\s*</td>')


def _parse_proxies(html):
    """Return the "ip:port" strings found in adjacent <td> cells of html."""
    return [ip + ":" + port for ip, port in _PROXY_ROW_RE.findall(html)]


def get_ips(url):
    """Download a proxy listing page and append its proxies to ipse.txt.

    The page at ``url`` is fetched with a browser-like User-Agent header,
    every "<td>ip</td><td>port</td>" pair is extracted, the resulting list of
    "ip:port" strings is printed, and each entry is appended to ``ipse.txt``
    followed by "\\r\\n".

    Args:
        url: Address of the HTML page to scrape.

    Returns:
        The list of "ip:port" strings found on the page (empty if none).

    Raises:
        urllib2.URLError: If the page cannot be fetched.
        IOError: If ``ipse.txt`` cannot be opened or written.
    """
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}
    req = urllib2.Request(url=url, headers=header)
    response = urllib2.urlopen(req)
    try:
        the_page = response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()

    ipadress = _parse_proxies(the_page)
    # Single-argument print(...) behaves the same as the print statement.
    print(ipadress)

    # Append mode: the caller decides when the file is reset.
    with open("ipse.txt", "a+") as f:
        for ip in ipadress:
            f.write(ip + "\r\n")
    return ipadress
# Start every run from an empty output file; get_ips() appends to it.
with open("ipse.txt", "w+") as f:
    f.write("")

# Alternative proxy source (disabled):
# get_ips("http://www.ip181.com/")

# Number of listing pages to crawl; change this to the number you need.
PAGE_COUNT = 4000

# Pages are numbered from 1, and range() excludes its upper bound, so the
# bound is PAGE_COUNT + 1 to make sure the last page is crawled too.
for ad in range(1, PAGE_COUNT + 1):
    url = "http://www.xicidaili.com/nn/" + str(ad)
    print(url)
    get_ips(url)