第一次上传

This commit is contained in:
xishandong 2023-07-19 11:01:44 +08:00
commit 2c9d84a149
173 changed files with 25224 additions and 0 deletions

94
README.md Normal file
View File

@ -0,0 +1,94 @@
# 爬虫项目实战
## 说明
所有项目均为作者练手分享项目,如遇侵权请联系删除,仅作学习分享,不能进行任何商业活动。
由于程序完成的时间问题,部分项目可能无法复用。
练习笔记见note.txt
此项目将持续更新
## 基础篇
### requests篇
1. 第一个爬虫程序,百度网页
2. 初始反爬-ua
3. 认识post请求-- 百度翻译
4. 豆瓣电影
5. 肯德基位置查询
### 解析html以及正则篇
1. 获取fakeua -- lxml解析
2. 4k图片爬取 -- lxml以及解决编码错误问题
3. 58 -- lxml以及分页爬取
4. bs案例
5. bs基础
6. xpath解析
7. xpath基础
8. 正则练习
9. 正则基础
10. 简历爬取
### selenium
1. 12306模拟登录
2. damai网
3. 基础自动操作
4. 模拟登录
5. 动作链和iframe处理
6. 无头浏览器和反检测
### scrapy篇
1. bossjob一级页面爬取
2. 双色球
3. 图片
4. 阳光政策
5. yi车数据爬取 -- 带有js逆向不过是入门级以及大批量json数据解析
6. 校花网
7. 网易新闻
8. 17k小说爬取
### 高性能异步爬虫
1. 认识flask
2. meinv图片批量爬取
3. 明星图片爬取
4. 多任务协程
5. 线程池基础
6. 线程池应用
### 综合案例
1. 某视频网站 --> m3u8视频下载,解决带密钥以及不带密钥的情况(m3u8入门级别),以及多线程下载
2. ins爬虫对于页面参数提取以及解析json文件
3. 语音爬虫,利用网络将文本转为语音,支持中英韩三种语言
4. 验证码相关 -- 某诗文网登录以及图片验证码解决 --- ddddocr
## 进阶篇
### js逆向专题
***
#### 请求头或响应数据加密
1. 某天气网站---> 动态js 动态key 动态参数 反debug
2. 某足球网站 --> 请求体多重加密,加密位置难定位
3. youdao翻译
4. fjs公共交易 --> 混淆参数加密
5. wangyiyun音乐 --> 实现全站数据爬取
6. 娱乐指数 --> 基础入门
#### 环境检测
#### wasm加密
1. 某航空 --> wasm操作内容实现加密解密 阿里系v2检测 阿里系v3检测(待解决)

239
js/wasm/air/Flight.py Normal file
View File

@ -0,0 +1,239 @@
import json
import re
import time
from urllib.parse import quote
import execjs
import prettytable as pt
import requests
class CEAir:
    """Command-line client for the m.ceair.com flight-search API.

    Request bodies are encrypted and responses decrypted by the ``encrypt`` and
    ``decrypto`` functions of ``./demo.js`` (run through ``execjs``); the same
    script's ``getCookie`` computes the ``acw_sc_v2`` cookie value.
    """

    # Location of the JavaScript helper, relative to the working directory.
    _JS_PATH = './demo.js'
    # Compiled execjs context; filled in lazily by _js_context().
    _js_ctx = None

    def __init__(self):
        self.headers = {
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Origin': 'https://m.ceair.com',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
        }
        self.cookies = {
            '_xid': 'CVUbYl9lz3HFU2na2mZviTHeQM%2BaLh%2FhZbEQ2Axq1MA%3D',
            '_fmdata': 'uD5xda4HKJuu34L%2BVFA7yz9OQ7lR4yI6hmuL2aRyRiYEBkNHudAH0OBn7047MefSAP4CBQbxadfirurKjXlEhA%3D%3D',
            'acw_tc': 'ac11000116822592388965513e00ce89a852f5d79237d70f416ffeb9d66973'
        }
        self.session = requests.Session()
        self.flag = 1

    @classmethod
    def _js_context(cls):
        """Return the compiled ``demo.js`` context, reading the file only once."""
        if cls._js_ctx is None:
            # ``with`` closes the handle that a bare open(...).read() would leak.
            with open(cls._JS_PATH, 'r', encoding='utf-8') as fp:
                cls._js_ctx = execjs.compile(fp.read())
        return cls._js_ctx

    def ajax_request(self, *, url, json_data) -> dict:
        """
        POST an encrypted payload and return the decrypted response.

        When the body is not JSON, the response text is handed to
        :meth:`cookie_update` and the request is sent again. The retry is a
        loop rather than recursion so repeated challenges cannot exhaust the
        call stack.

        :param url: API endpoint
        :param json_data: request body (sent as JSON)
        :return: decrypted response parsed from JSON
        """
        while True:
            resp = self.session.post(url=url, json=json_data, headers=self.headers, cookies=self.cookies)
            try:
                data = resp.json()['res']
            except requests.exceptions.JSONDecodeError:
                # Non-JSON body: refresh cookies from the returned page, then retry.
                self.cookie_update(resp.text)
                print('cookies更新完成')
                continue
            return json.loads(self._js_context().call('decrypto', data))

    def get_flight(self, arr, dep, date):
        """
        Query flights for one route and date and print them as a table.

        :param arr: arrival city name
        :param dep: departure city name
        :param date: departure date
        :return: None (results are printed)
        """
        data = {
            "tripType": 0, "depCode": self.get_city_code(dep), "arrCode": self.get_city_code(arr), "dt": "1", "at": "1",
            "depN": dep, "arrN": arr,
            "flightDate": date, "carryChd": False, "carryInf": False, "productType": "CASH", "curIndex": 0
        }
        # The search-page URL, with the query embedded as URL-quoted JSON, is sent as Referer.
        referer = 'https://m.ceair.com/mapp/reserve/flightList?newParam=' + quote(
            json.dumps(data, separators=(',', ':')))
        self.headers.update({
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Content-Type': 'application/json',
            'M-CEAIR-ENCRYPTED': 'true',
            'Origin': 'https://m.ceair.com',
            'Pragma': 'no-cache',
            'Referer': referer,
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
            'X-CEAIR-OS': 'M',
            'app_token_key': '',
            'sec-ch-ua': '"Chromium";v="112", "Google Chrome";v="112", "Not:A-Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'transactionId': '05202304231034048094',
        })
        json_data = {
            "currentQueryType": "FLIGHT_LIST", "currentSegIndex": 0, "carryChd": False, "carryInf": False,
            "productCodes": [],
            "selectedRoutes": [], "productType": "CASH",
            "routes": [{"arrCode": data['arrCode'], "depCode": data['depCode'], "flightDate": data['flightDate'],
                        "arrCodeType": data['at'], "depCodeType": data['dt'], "depCityName": data['depN'],
                        "arrCityName": data['arrN'], "segIndex": 0}],
            "tripType": "OW", "cabinGrade": "", "memberLabel": "", "salesChannel": "7701", "moduleX": "mShopping",
            "os": "M",
            "appVersion": "99.0.0", "transactionId": "05202304231034048094"
        }
        plain = json.dumps(json_data, separators=(',', ':'))
        enc = self._js_context().call('encrypt', plain)
        resp = self.ajax_request(url='https://m.ceair.com/m-base/sale/shopping', json_data={'req': enc})
        flights = resp['data'].get('flights') if resp['data'] else None
        if flights:
            self.print_flights(list(self.process_json(flights)))
        else:
            print('没有这一天的航班信息!!或者输入了国家')

    def cookie_update(self, html):
        """
        Update ``self.cookies`` from a challenge page.

        If the page contains both an ``arg1='...'`` value and a
        ``setCookie("<name>", ... x)`` call, the cookie ``<name>`` is computed
        with :meth:`acw_sc_v2`; otherwise :meth:`acw_sc_v3` asks the user for
        the ``acw_tc`` and ``acw_sc__v3`` values.

        :param html: page source returned instead of JSON
        :return: None
        """
        pattern1 = re.compile(r'.*?arg1=\'(.*?)\';')
        pattern2 = re.compile(r'.*?setCookie\(\"(.*?)\".*?,.*?x\)')
        arg1 = pattern1.findall(html)
        name = pattern2.findall(html)
        if name and arg1:
            print('====开始处理acw_sc_v2====')
            self.cookies[name[0]] = self.acw_sc_v2(arg1[0])
            print('acw_sc_v2 =', self.cookies[name[0]])
            print('====结束处理acw_sc_v2====')
        else:
            item = self.acw_sc_v3(html)
            self.cookies['acw_tc'] = item[0]
            self.cookies['acw_sc__v3'] = item[1]

    @staticmethod
    def acw_sc_v2(arg):
        """
        Compute the acw_sc_v2 cookie value with ``getCookie`` from demo.js.

        :param arg: the ``arg1`` value extracted from the challenge page
        :return: generated cookie value
        """
        return CEAir._js_context().call('getCookie', arg)

    @staticmethod
    def acw_sc_v3(html):
        """
        Handle the slider challenge manually.

        The page is saved to ``./slide.html`` and the user is prompted for the
        two resulting cookie values.

        :param html: challenge page source
        :return: tuple ``(acw_tc, acw_sc__v3)`` as typed by the user
        """
        print('====开始处理滑块====')
        with open('./slide.html', 'w', encoding='utf-8') as fp:
            fp.write(html)
        acw_tc = input('acw_tc: ')
        acw_sc__v3: str = input('acw_sc__v3: ')
        print('====结束处理滑块====')
        return acw_tc, acw_sc__v3

    @staticmethod
    def process_json(flights):
        """
        Yield one flat display record per flight dict in *flights*.

        :param flights: iterable of flight dicts from the decrypted response
        :return: generator of dicts whose values are display strings
        """
        for _flight in flights:
            stops = '\n'.join(
                info['typeText'] + ',' + info['cityName'] + ',' + info['stopTime']
                for info in _flight['transferStopInfos'])
            # Falsy entries in these two lists are skipped.
            services = '\n'.join(
                f"{info['flightNoGroup']}, {info['meal']}" for info in _flight['flightServices'] if info)
            fares = '\n'.join(
                f'{info["baseCabinCodeText"]} : {info["salePrice"]}' for info in _flight['fares'] if info)
            yield {
                'flightNoGroup': _flight['flightNoGroup'],
                'depDate': _flight['depTime'] + ' ' + _flight['depDate'] + ' ' + _flight['depWeek'],
                'arrDate': _flight['arrTime'] + ' ' + _flight['arrDate'] + ' ' + _flight['arrWeek'],
                'depAirportName': _flight['depCityName'] + _flight['depAirportName'] + _flight['depTerminal'],
                'arrAirportName': _flight['arrCityName'] + _flight['arrAirportName'] + _flight['arrTerminal'],
                'transferStopInfos': stops,
                'flightServices': services,
                'fares': fares
            }

    @staticmethod
    def print_flights(items):
        """
        Print the records produced by :meth:`process_json` as a table.

        :param items: iterable of record dicts; values are used in insertion order
        """
        tb = pt.PrettyTable()
        tb.field_names = ['航班号', '出发时间', '到达时间', '出发机场', '到达机场', '中转信息', '是否含餐食', '价格']
        tb.align = 'c'
        # Padding width of each column.
        tb.padding_width = 12
        for item in items:
            tb.add_row(list(item.values()))
        print(tb)

    @staticmethod
    def get_city_code(city_name):
        """
        Look up the code of a city by name.

        Request or response-shape failures are retried every 2 seconds
        without limit.

        :param city_name: city name to search for
        :return: ``CityCode`` of the first match, or None when the list is empty
        """
        # Both dicts are loop-invariant, so they are built once.
        json_data = {
            'methodName': 'searchairport',
            'IsHighLight': True,
            'Keyword': city_name,
            'comeFrom': 'CEAIR_M',
            'CountryType': None,
            'salesChannel': '7701',
            'moduleX': 'mShopping',
            'os': 'M',
            'appVersion': '99.0.0',
            'transactionId': '0520230421151319554',
        }
        headers = {
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Content-Type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
        }
        while True:
            try:
                resp = requests.post('https://m.ceair.com/m-base/sale/getBasicData', headers=headers,
                                     json=json_data).json()['data']['Data']['CityList']
                if resp:
                    print(city_name + '---' + resp[0]['CityCode'])
                    return resp[0]['CityCode']
                return None
            except (requests.RequestException, LookupError, TypeError, ValueError):
                # Narrowed from a bare ``except:`` so Ctrl-C and SystemExit
                # are no longer swallowed by the retry loop.
                time.sleep(2)
def check_value(value):
    """Return source code for ``exec`` that aborts the program on empty input.

    :param value: the raw user input to validate
    :return: a snippet that prints an error and exits when *value* is falsy,
        otherwise an empty string, so ``exec(check_value(x))`` is a no-op for
        valid input instead of raising ``TypeError`` on ``None``
    """
    if not value:
        return 'import sys\nprint("您的输入有误!退出程序...")\nsys.exit(1)'
    return ''
if __name__ == '__main__':
    # Interactive loop: prompt for arrival city, departure city and date, then
    # print the matching flights. Runs until the process is interrupted.
    # Input validation through check_value() is currently disabled.
    client = CEAir()
    while True:
        query = {
            'arr': input('输入到达城市: '),
            'dep': input('输入出发城市: '),
            'date': input('输入出发时间: '),
        }
        client.get_flight(**query)

10602
js/wasm/air/ddd.js Normal file

File diff suppressed because one or more lines are too long

191
js/wasm/air/demo.js Normal file
View File

@ -0,0 +1,191 @@
const Module = require('./ddd.js')
// Sample base64 ciphertext; passed to decrypto() by the console.log call below.
var res = 'yX67CAY2RrMSJ1TxneWBANxXBK5wL6Rvk2bDRa+DKUspUee9v69x1s3TH1rv8tP4cJl5aQImurVCCHIQM4kz9xljjBV922sd1uSCv9xjnAKFy0lD9G2yVHb3kXpNLrB0eu7ITdyjo0Dxa0STnuTci4Bhci2TJ5dpsC20pGvgCeravpF32+Y+HnKZHjQG9/4fuCyODs6yYY42eAo7RP78abnihlRmIxWVFbMTb/6SYtqEycGkUtWyB1bTrXl+lpMXlfOMllQj7nUwNaS3DKNL+c2FPxVrcpbwdHtqm2icFIJA3UFFOUn6VrucV2oLNgbWMr2rKK75fSQu+nvLIoVCbN0hU4U2ccgAXVS2p7ZrUVa3smdoTBj+AUVBWgTBl70du36fCmHPdjUrpoI4v0+sLtkOTrGhGKs8LX6Tu8Gd9mNAcLYpdZmgA2ndgtTlF7Up8lYJBsFKLrKurO0EVWqcWQ4rZSoesJ0bwXmIzikqq04xcTM61oe7VAKBd33KZXh0FJsP/rRMwtRFXuvvvcnfAGAhvfRWh+pzSdPULo3s0mtTbiKM5P3rabCzimEAGCFbKg8cCfiO+PakuqZlkxWj+MdAWV58xfFksTKSxdhzNeQ6WzKn4g/gZzxJ0mM2nrxQ'
// Sample plaintext request body; passed to encrypt() by the console.log call below.
var input = '{"currentQueryType":"FLIGHT_LIST","currentSegIndex":0,"carryChd":false,"carryInf":false,"productCodes":[],"selectedRoutes":[],"productType":"CASH","routes":[{"arrCode":"BJS","depCode":"SHA","flightDate":"20230424","arrCodeType":"1","depCodeType":"1","depCityName":"上海","arrCityName":"北京","segIndex":0}],"tripType":"OW","cabinGrade":"","memberLabel":"","salesChannel":"7701","moduleX":"mShopping","os":"M","appVersion":"99.0.0","transactionId":"0520230421133633605"}'
// Sample challenge string; passed to getCookie() by the console.log call below.
var arg1 = '1BD9F59BC7D673529D64C4F4285144E6AA4B3127';
// 16-byte array that decrypto()/encrypt() pass as the `iv` argument of the wbsk_AES_cbc_* helpers.
var k = [121, 96, 7, 103, 57, 95, 61, 124, 121, 96, 7, 103, 57, 95, 61, 124]
/**
 * Derive the cookie value from the challenge string `arg1`.
 *
 * The characters of `arg1` are first reordered by a fixed permutation table,
 * then XOR-ed byte by byte (as hex pairs) with a fixed hex mask.
 *
 * Unlike the original, this version neither patches String.prototype on every
 * call nor assigns the implicit global `arg2`.
 *
 * @param {string} arg1 challenge string taken from the page
 * @returns {string} lower-case hex string
 */
function getCookie(arg1) {
    // Hex mask XOR-ed with the reordered challenge.
    const mask = '3000176000856006061501533003690027800375';
    // 1-based source positions: output character j is arg1[order[j] - 1].
    const order = [0xf, 0x23, 0x1d, 0x18, 0x21, 0x10, 0x1, 0x26, 0xa, 0x9, 0x13, 0x1f, 0x28, 0x1b, 0x16, 0x17, 0x19, 0xd, 0x6, 0xb, 0x27, 0x12, 0x14, 0x8, 0xe, 0x15, 0x20, 0x1a, 0x2, 0x1e, 0x7, 0x4, 0x11, 0x5, 0x3, 0x1c, 0x22, 0x25, 0xc, 0x24];

    // Reorder the characters of `text`; positions past its end contribute nothing.
    function unsbox(text) {
        return order.map(function (pos) {
            return text[pos - 1];
        }).join('');
    }

    // XOR two hex strings pair by pair, up to the shorter length.
    function hexXor(left, right) {
        let out = '';
        for (let i = 0; i < left.length && i < right.length; i += 2) {
            const a = parseInt(left.slice(i, i + 2), 16);
            const b = parseInt(right.slice(i, i + 2), 16);
            const hex = (a ^ b).toString(16);
            // Keep every byte two hex digits wide.
            out += hex.length === 1 ? '0' + hex : hex;
        }
        return out;
    }

    return hexXor(unsbox(arg1), mask);
}
/**
 * Decrypt a base64 response payload with the module-level key `k`.
 *
 * @param {string} data base64 ciphertext
 * @returns {string} decrypted JSON text
 * @throws {SyntaxError} when the decrypted text is not valid JSON
 */
function decrypto(data) {
    var plain = wbsk_AES_cbc_decrypt_base64(data, k);
    // Parsed only to validate the result; the original stored it in an
    // undeclared global `a`, which leaked into the global scope.
    JSON.parse(plain);
    return plain;
}
/**
 * Encrypt a plaintext request body with the module-level key `k`.
 *
 * @param {string} data plaintext to encrypt
 * @returns {string} base64 ciphertext
 */
function encrypt(data) {
    var cipherText = wbsk_AES_cbc_encrypt_base64(data, k);
    return cipherText;
}
// Load-time smoke test of the three exported helpers using the sample values
// declared at the top of the file. The functions called here are declared
// further down; function declarations are hoisted, so this works.
console.log(decrypto(res));
console.log(encrypt(input))
console.log(getCookie(arg1))
/**
 * Decode base64 ciphertext, decrypt it and return the result as text.
 *
 * @param {string} input base64 ciphertext
 * @param {number[]} iv byte array forwarded to wbsk_AES_cbc_decrypt
 * @returns {string} decrypted text
 */
function wbsk_AES_cbc_decrypt_base64(input, iv) {
    var cipherBytes = base64ToArrayBuffer(input);
    var plainBytes = wbsk_AES_cbc_decrypt(cipherBytes, cipherBytes.length, iv, iv.length);
    return byteToString(plainBytes);
}
// Call the module's native `wbsk_AES_cbc_decrypt` routine.
//   input / inlen : ciphertext bytes and their count
//   iv / ivlen    : byte array and its length, forwarded unchanged
// Returns a plain array holding the first `olen` bytes of the output buffer.
function wbsk_AES_cbc_decrypt(input, inlen, iv, ivlen) {
var tt = [];
var len = inlen;
// Output buffer of `len` bytes inside the module heap, viewed through HEAP8.
var outadd = Module._malloc(len);
var output = Module.HEAP8.subarray(outadd, outadd + len);
// 4-byte slot holding the buffer length; HEAP32 is indexed in 4-byte units.
var lenadd = Module._malloc(4);
var lenput = Module.HEAP32.subarray(lenadd / 4, lenadd / 4 + 1);
lenput[0] = len;
var CBCDecrypt = Module.cwrap('wbsk_AES_cbc_decrypt', 'number', ['array', 'number', 'number', 'number', 'array', 'number'])
// NOTE(review): the return code `r` is never checked.
var r = CBCDecrypt(new Uint8Array(input), inlen, outadd, lenadd, new Uint8Array(iv), ivlen);
// Presumably the native routine overwrites the slot with the real output length - TODO confirm.
var olen = lenput[0];
// Copy every element of the heap view into a plain array before freeing.
for (var key in output) {
tt.push(output[key]);
}
Module._free(outadd);
Module._free(lenadd);
return (tt.slice(0, olen));
}
/**
 * Convert text to bytes, encrypt them and return the result as base64.
 *
 * @param {string} input plaintext
 * @param {number[]} iv byte array forwarded to wbsk_AES_cbc_encrypt
 * @returns {string} base64 ciphertext
 */
function wbsk_AES_cbc_encrypt_base64(input, iv) {
    var plainBytes = stringToByte(input);
    var cipherBytes = wbsk_AES_cbc_encrypt(plainBytes, plainBytes.length, iv, iv.length);
    return arrayBufferToBase64(cipherBytes);
}
// Call the module's native `wbsk_AES_cbc_encrypt` routine.
//   input / inlen : plaintext bytes and their count
//   iv / ivlen    : byte array and its length, forwarded unchanged
// Returns a plain array holding the first `olen` bytes of the output buffer.
function wbsk_AES_cbc_encrypt(input, inlen, iv, ivlen) {
var tt = [];
// Next multiple of 16 strictly greater than inlen (a full extra block when
// inlen is already a multiple of 16) - presumably room for block padding.
var len = (Math.floor(inlen / 16) + 1) * 16;
// Output buffer of `len` bytes inside the module heap, viewed through HEAP8.
var outadd = Module._malloc(len);
var output = Module.HEAP8.subarray(outadd, outadd + len);
// 4-byte slot holding the buffer length; HEAP32 is indexed in 4-byte units.
var lenadd = Module._malloc(4);
var lenput = Module.HEAP32.subarray(lenadd / 4, lenadd / 4 + 1);
lenput[0] = len;
var CBCEncrypt = Module.cwrap('wbsk_AES_cbc_encrypt', 'number', ['array', 'number', 'number', 'number', 'array', 'number'])
// NOTE(review): the return code `r` is never checked.
var r = CBCEncrypt(new Uint8Array(input), inlen, outadd, lenadd, new Uint8Array(iv), ivlen);
// Presumably the native routine overwrites the slot with the real output length - TODO confirm.
var olen = lenput[0];
// Copy every element of the heap view into a plain array before freeing.
for (var key in output) {
tt.push(output[key]);
}
Module._free(outadd);
Module._free(lenadd);
return (tt.slice(0, olen));
}
/**
 * Encode a string as an array of UTF-8 byte values.
 *
 * The string is walked by code point rather than by UTF-16 code unit.
 * With charCodeAt() the value never exceeds 0xFFFF, so the 4-byte branch was
 * unreachable and characters outside the BMP were emitted as two 3-byte
 * surrogate encodings instead of one 4-byte sequence.
 *
 * @param {string} str text to encode
 * @returns {number[]} byte values in the range 0-255
 */
function stringToByte(str) {
    var bytes = [];
    for (var i = 0; i < str.length; i++) {
        var c = str.codePointAt(i);
        if (c >= 0x010000) {
            // The code point used two UTF-16 units; skip the low surrogate.
            i++;
            bytes.push(((c >> 18) & 0x07) | 0xF0);
            bytes.push(((c >> 12) & 0x3F) | 0x80);
            bytes.push(((c >> 6) & 0x3F) | 0x80);
            bytes.push((c & 0x3F) | 0x80);
        } else if (c >= 0x000800) {
            bytes.push(((c >> 12) & 0x0F) | 0xE0);
            bytes.push(((c >> 6) & 0x3F) | 0x80);
            bytes.push((c & 0x3F) | 0x80);
        } else if (c >= 0x000080) {
            bytes.push(((c >> 6) & 0x1F) | 0xC0);
            bytes.push((c & 0x3F) | 0x80);
        } else {
            bytes.push(c & 0xFF);
        }
    }
    return bytes;
}
/**
 * Decode an array of UTF-8 byte values into a string.
 *
 * Elements are masked with 0xff, so signed bytes are accepted. A string
 * argument is returned unchanged.
 *
 * Fixes over the original binary-string decoder:
 *   - 2-byte sequences kept only 3 of their 5 lead-byte payload bits, so code
 *     points from U+0200 to U+07FF decoded incorrectly;
 *   - 4-byte sequences went through String.fromCharCode, which truncates
 *     values above 0xFFFF.
 * Invalid lead bytes and out-of-range code points become U+FFFD.
 *
 * @param {number[]|Int8Array|Uint8Array|string} arr bytes to decode
 * @returns {string} decoded text
 */
function byteToString(arr) {
    if (typeof arr === 'string') {
        return arr;
    }
    var str = '';
    for (var i = 0; i < arr.length; i++) {
        var lead = arr[i] & 0xff;
        var extra; // number of continuation bytes that follow the lead byte
        var code;  // code point accumulated so far
        if (lead < 0x80) {
            extra = 0;
            code = lead;
        } else if ((lead & 0xe0) === 0xc0) {
            extra = 1;
            code = lead & 0x1f;
        } else if ((lead & 0xf0) === 0xe0) {
            extra = 2;
            code = lead & 0x0f;
        } else if ((lead & 0xf8) === 0xf0) {
            extra = 3;
            code = lead & 0x07;
        } else {
            // Stray continuation byte or invalid lead byte.
            str += '\ufffd';
            continue;
        }
        for (var j = 1; j <= extra; j++) {
            // Bytes missing at the end of the input contribute zero bits.
            code = (code << 6) | (arr[i + j] & 0x3f);
        }
        i += extra;
        str += code > 0x10ffff ? '\ufffd' : String.fromCodePoint(code);
    }
    return str;
}
/**
 * Base64-encode a sequence of bytes.
 *
 * The input is copied into a Uint8Array first, so out-of-range values
 * (e.g. signed bytes) wrap to 0-255 before encoding.
 *
 * @param {ArrayBuffer|number[]} buffer bytes to encode
 * @returns {string} base64 text
 */
function arrayBufferToBase64(buffer) {
    var chars = Array.from(new Uint8Array(buffer), function (byte) {
        return String.fromCharCode(byte);
    });
    return btoa(chars.join(''));
}
/**
 * Decode base64 text into its bytes.
 *
 * @param {string} base64 base64 text
 * @returns {Uint8Array} decoded bytes
 */
function base64ToArrayBuffer(base64) {
    return Uint8Array.from(atob(base64), function (ch) {
        return ch.charCodeAt(0);
    });
}

332
js/wasm/air/slide.html Normal file
View File

@ -0,0 +1,332 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>滑动验证页面</title>
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<style type="text/css">
html, body, div, span, p{
margin:0;
padding:0;
border:0;
outline:0;
font-size:100%;
vertical-align:baseline;
background:transparent;
}
body{
background: #fff;
}
</style>
<script type="text/javascript">
// Provide a no-op console.log where the browser has no console object.
if (window.console === undefined){
console = {};
console.log = function(){};
};
// Flag read later by insertScripts()/initNC() to choose the H5 or PC layout.
window._waf_is_mobile = false;
// Set the flag when the user-agent string (or its first four characters)
// matches one of the two device patterns below.
(function(a) {
if (/(android|bb\d+|meego).+mobile|avantgo|bada\/|blackberry|blazer|compal|elaine|fennec|hiptop|iemobile|ip(hone|od)|iris|kindle|lge |maemo|midp|mmp|mobile.+firefox|netfront|opera m(ob|in)i|palm( os)?|phone|p(ixi|re)\/|plucker|pocket|psp|series(4|6)0|symbian|treo|up\.(browser|link)|vodafone|wap|windows ce|xda|xiino/i.test(a) || /1207|6310|6590|3gso|4thp|50[1-6]i|770s|802s|a wa|abac|ac(er|oo|s\-)|ai(ko|rn)|al(av|ca|co)|amoi|an(ex|ny|yw)|aptu|ar(ch|go)|as(te|us)|attw|au(di|\-m|r |s )|avan|be(ck|ll|nq)|bi(lb|rd)|bl(ac|az)|br(e|v)w|bumb|bw\-(n|u)|c55\/|capi|ccwa|cdm\-|cell|chtm|cldc|cmd\-|co(mp|nd)|craw|da(it|ll|ng)|dbte|dc\-s|devi|dica|dmob|do(c|p)o|ds(12|\-d)|el(49|ai)|em(l2|ul)|er(ic|k0)|esl8|ez([4-7]0|os|wa|ze)|fetc|fly(\-|_)|g1 u|g560|gene|gf\-5|g\-mo|go(\.w|od)|gr(ad|un)|haie|hcit|hd\-(m|p|t)|hei\-|hi(pt|ta)|hp( i|ip)|hs\-c|ht(c(\-| |_|a|g|p|s|t)|tp)|hu(aw|tc)|i\-(20|go|ma)|i230|iac( |\-|\/)|ibro|idea|ig01|ikom|im1k|inno|ipaq|iris|ja(t|v)a|jbro|jemu|jigs|kddi|keji|kgt( |\/)|klon|kpt |kwc\-|kyo(c|k)|le(no|xi)|lg( g|\/(k|l|u)|50|54|\-[a-w])|libw|lynx|m1\-w|m3ga|m50\/|ma(te|ui|xo)|mc(01|21|ca)|m\-cr|me(rc|ri)|mi(o8|oa|ts)|mmef|mo(01|02|bi|de|do|t(\-| |o|v)|zz)|mt(50|p1|v )|mwbp|mywa|n10[0-2]|n20[2-3]|n30(0|2)|n50(0|2|5)|n7(0(0|1)|10)|ne((c|m)\-|on|tf|wf|wg|wt)|nok(6|i)|nzph|o2im|op(ti|wv)|oran|owg1|p800|pan(a|d|t)|pdxg|pg(13|\-([1-8]|c))|phil|pire|pl(ay|uc)|pn\-2|po(ck|rt|se)|prox|psio|pt\-g|qa\-a|qc(07|12|21|32|60|\-[2-7]|i\-)|qtek|r380|r600|raks|rim9|ro(ve|zo)|s55\/|sa(ge|ma|mm|ms|ny|va)|sc(01|h\-|oo|p\-)|sdk\/|se(c(\-|0|1)|47|mc|nd|ri)|sgh\-|shar|sie(\-|m)|sk\-0|sl(45|id)|sm(al|ar|b3|it|t5)|so(ft|ny)|sp(01|h\-|v\-|v )|sy(01|mb)|t2(18|50)|t6(00|10|18)|ta(gt|lk)|tcl\-|tdg\-|tel(i|m)|tim\-|t\-mo|to(pl|sh)|ts(70|m\-|m3|m5)|tx\-9|up(\.b|g1|si)|utst|v400|v750|veri|vi(rg|te)|vk(40|5[0-3]|\-v)|vm40|voda|vulc|vx(52|53|60|61|70|80|81|83|85|98)|w3c(\-| )|webc|whit|wi(g |nc|nw)|wmlb|wonu|x700|yas\-|your|zeto|zte\-/i.test(a.substr(0, 4))){
window._waf_is_mobile = true;
}
})(navigator.userAgent || navigator.vendor || window.opera);
</script>
<!-- 海外页面加载此js -->
<!-- <script type="text/javascript" charset="utf-8" src="//aeis.alicdn.com/sd/ncpc/nc.js?t=2015052012"></script> -->
</head>
<body>
<div id="PC" style="display: none">
<div class="contentbg">
<div class="content">
<div class="left"></div>
<div class="right">
<h1>访问验证</h1>
<p>别离开,为了更好的访问体验,请滑动滑块进行验证,通过后即可继续访问网页</p>
<div id="nocaptcha" class="nc-container"></div>
</div>
</div>
</div>
<div id="_umfp" style="display:inline;width:1px;height:1px;overflow:hidden"></div>
</div>
<div id="H5" style="display: none;">
<div class="waf-nc-h5-mask"></div>
<div id="WAF_NC_H5_WRAPPER" class="waf-nc-h5-wrapper">
<div class="waf-nc-h5-panel">
<img class="waf-nc-h5-icon" src="//img.alicdn.com/imgextra/i1/O1CN01L12MaQ1ZwfYKk7Yrc_!!6000000003259-2-tps-900-594.png" alt="" height="132" width="200">
<div class="waf-nc-h5-description">为了更好的访问体验,请滑动滑块进行验证</div>
</div>
<div id="h5_nocaptcha" class="nc-container" data-nc-idx="1"></div>
</div>
</div>
<div style="margin-left:20px" id="traceid">TraceID: 76b20fe716897324658686923e7d50</div>
</body>
<script type="text/javascript">
// Description of the request that triggered this verification page.
// The success callbacks replay it once the slider check passes.
var requestInfo = {
type: 'POST', // 'GET' or 'POST'
url: 'https://m.ceair.com/m-base/sale/shopping', // 'https://www.taobao.com/detail'
args: '',
data: '{"req": "yX67CAY2RrMSJ1TxneWBANxXBK5wL6Rvk2bDRa DKUspUee9v69x1s3TH1rv8tP4cJl5aQImurVCCHIQM4kz9xljjBV922sd1uSCv9xjnAKFy0lD9G2yVHb3kXpNLrB0eu7ITdyjo0Dxa0STnuTci4Bhci2TJ5dpsC20pGvgCeravpF32 Y HnKZHjQG9/4fuCyODs6yYY42eAo7RP78abnihlRmIxWVFbMTb/6SYtqEycGkUtWyB1bTrXl lpMXlfOMllQj7nUwNaS3DKNL b25tZGFQATiHpx X 5//zSrvAGMKDhqWPiGGOpETy2hjTWVdCSg30GiXLi1JVHPgb8XwWNx7P8vSQwRR5yWsOwOS4VlTS48Bn7uCRs25XhSkkMcPUzMh4CNScCvQsK4b1ecjup7j7S/sk8iGwSQjmXoVplQthl/CT35Rrd0LEHtn6xESx436EYQJzafDdweVi7CbFlFr2tzykvHlNsj/iYxC2R6PuGQYhcTGkOXQ36MT1oeSdMHuDYDicdwUQiTmCUhxg2vAHvtSYmOXfnLESxBoKefeLXy0IDmAiHrK8riamcKKMCxQkj3S29GKKM/a/kH3nu8t7KlM55EaP/B7cExqbwS2WfAesDUlxbCaivh5sar3KLxlYEeS2rs4ltFKQ=="}', //a=1&b=2&c=3...
token: '1032574b-c24f-4eff-aba3-cf9f3fd317fc',
refer: 'ejy+YRfrjq63pIq12Exo1rDW6sI=',
headers: {"Content-Type":"application/json"},
};
// Inject the AWSC loader script (calling initNC() once it has loaded) and a
// <style> element holding the CSS for the detected layout.
function insertScripts(){
var script = document.createElement('script');
var time = new Date();
var head = document.head || document.getElementsByTagName('head')[0];
// The parenthesised parts are numbers, so they are added arithmetically: the
// `t` parameter is the sum of year, month, day and hour, not their digits joined.
script.src = '//g.alicdn.com/AWSC/AWSC/awsc.js?t=' + (time.getFullYear()+(time.getMonth()+1)+time.getDate()+time.getHours());
// Use onload where available, otherwise fall back to onreadystatechange.
if ("onload" in script) {
script.onload = function(){
initNC();
}
} else {
script.onreadystatechange = function() {
if (/loaded|complete/.test(script.readyState)) {
initNC();
}
};
}
head.appendChild(script);
// Load the CSS that matches the detected layout.
var style = document.createElement('style');
style.type = "text/css";
var css = '';
if (_waf_is_mobile){
css = '#waf_nc_h5_block{position:fixed;_position:absolute;width:100%;height:100%;top:0;bottom:0;left:0;z-index:9999}.waf-nc-h5-mask{background:#777;opacity:.5;filter:alpha(opacity=50);width:100%;height:100%}.waf-nc-h5-wrapper{width:94%;position:absolute;top:20%;left:50%;margin-top:-20%;margin-left:-47%;padding:5% 1%;background:#fff;border-radius:3px;box-sizing:border-box}.waf-nc-h5-panel{width:100%;text-align:center}.waf-nc-h5-icon{margin:0 auto}.waf-nc-h5-description{margin-top:40px;font-size:14px;color:#595959}#traceid{text-align:center;margin-top:500px!important;font-size:12px;color:#999}.nc-container{margin-top:30px}.nc_bg{background:#fff3ea!important}.btn_slide{background:#ff6a00!important;border:0!important;color:#fff!important;width:48px!important;height:48px!important;font-size:30px;font-weight:900!important;line-height:48px!important}.btn_ok{background:#ff6a00!important;border:0!important;color:#fff!important;width:48px!important;height:48px!important;font-size:30px;font-weight:900!important;line-height:48px!important}.nc_scale{background:rgba(241,241,242,1)!important;height:48px!important}.nc-lang-cnt{height:48px!important;margin-left:10px!important;line-height:48px!important;font-size:14px!important}.nc_1_nocaptcha{width:300px!important;height:48px!important;margin:auto!important;left:0!important;right:0!important}'
}else{
css = '.head{position:relative;height:70px;padding-left:25px;border-bottom:1px solid #ebecec}.content{width:1000px;min-height:250px;margin-top:164px!important;margin-left:auto;margin-right:auto}.left{width:300px;height:198px;float:left;background:url(https://img.alicdn.com/imgextra/i1/O1CN01L12MaQ1ZwfYKk7Yrc_!!6000000003259-2-tps-900-594.png) no-repeat;background-size:cover}.right{margin-left:250px;padding-left:140px}.contentbg{width:100%;min-height:250px}.right p{font-size:14px;color:#333;line-height:25px;height:25px;text-align:left}#nocaptcha{margin-top:40px;margin-left:5px;width:300px!important}#nocaptcha .nc-lang-cnt{color:#fff}#nocaptcha .clickCaptcha_text .nc-lang-cnt{color:#333}#traceid{margin-left:250px!important;padding-left:140px!important;font-size:14px;color:rgba(153,153,153,1)}.nc_bg{background:#fff3ea!important}.btn_slide{background:#ff6a00!important;border:0!important;color:#fff!important;width:48px!important;height:48px!important;font-size:30px;font-weight:900!important;line-height:48px!important}.btn_ok{background:#ff6a00!important;border:0!important;color:#fff!important;width:48px!important;height:48px!important;font-size:30px;font-weight:900!important;line-height:48px!important}.nc_scale{background:rgba(241,241,242,1)!important;height:48px!important}.nc-lang-cnt{height:48px!important;margin-left:10px!important;line-height:48px!important;font-size:14px!important}.nc_1_nocaptcha{width:300px!important;height:48px!important}'
}
// Fall back to styleSheet.cssText when appending a text node throws.
try {
style.appendChild(document.createTextNode(css));
}catch(e){
style.styleSheet.cssText = css;
}
// `head` is declared a second time here; it resolves to the same element.
var head = document.head || document.getElementsByTagName('head')[0];
head.appendChild(style);
}
// Start loading as soon as this script block is evaluated.
insertScripts();
/**
 * Split a URL into the part before the query, the query and the fragment.
 *
 * A '?' that appears after the first '#' belongs to the fragment, so the
 * search part is empty in that case.
 *
 * @param {string} url URL to split
 * @returns {{base: string, search: string, hash: string, original: string}}
 *          `search` keeps its leading '?' and `hash` its leading '#'
 */
function parseURL(url) {
    var queryAt = url.indexOf('?');
    var hashAt = url.indexOf('#');
    var parts = { base: url, search: '', hash: '', original: url };
    try {
        var hasHash = hashAt >= 0;
        var hasSearch = queryAt >= 0 && !(hasHash && queryAt > hashAt);
        if (hasSearch) {
            parts.base = url.slice(0, queryAt);
            parts.search = url.slice(queryAt, hasHash ? hashAt : url.length);
        } else if (hasHash) {
            parts.base = url.slice(0, hashAt);
        }
        if (hasHash) {
            parts.hash = url.slice(hashAt, url.length);
        }
    } catch (e) {
        // On any failure treat the whole input as the base.
        parts.base = url;
        parts.search = '';
        parts.hash = '';
    }
    return parts;
}
/**
 * Parse a query string that starts with '?' into a name -> value object.
 *
 * Names and values are URI-decoded; the `u_asec` parameter is dropped.
 * Every decoded name is also written to the console.
 *
 * @param {string} qstr query string including the leading '?'
 * @returns {Object} decoded parameters, or {} when qstr does not start with '?'
 */
function parseQuery(qstr) {
    var query = {};
    if (qstr.charAt(0) != '?') {
        return query;
    }
    qstr.substr(1).split('&').forEach(function (pair) {
        var kv = pair.split('=');
        console.log(decodeURIComponent(kv[0]))
        var name = decodeURIComponent(kv[0]);
        if (name !== 'u_asec') {
            query[name] = decodeURIComponent(kv[1] || '');
        }
    });
    return query;
}
/**
 * Merge extra parameters into an existing query string.
 *
 * @param {string} query existing query string (parsed with parseQuery)
 * @param {Object} data parameters to add; they override existing names
 * @returns {string} URI-encoded query string with a leading '?', or '' when
 *          there are no parameters at all
 */
function addQuery(query, data) {
    var merged = parseQuery(query);
    var name;
    for (name in data) {
        merged[name] = data[name];
    }
    var pairs = [];
    for (name in merged) {
        pairs.push(encodeURIComponent(name) + '=' + encodeURIComponent(merged[name]));
    }
    return pairs.length ? '?' + pairs.join('&') : '';
}
/**
 * Reassemble a URL from the parts produced by parseURL.
 *
 * @param {{base: string, search: string, hash: string}} parsedUrl URL parts
 * @returns {string} base, search and hash concatenated in that order
 */
function combineUrl(parsedUrl) {
    var head = parsedUrl.base + parsedUrl.search;
    return head + parsedUrl.hash;
}
/**
 * Turn a form-encoded string ("a=1&b=2") into hidden <input> markup.
 *
 * Names and values are HTML-escaped before being placed in the attributes;
 * the original interpolated them raw, so a '"' or '<' in the data broke the
 * markup. A pair without '=' now gets an empty value instead of the text
 * "undefined".
 *
 * @param {string} qstr form-encoded data
 * @returns {string[]} one `<input type="hidden">` string per pair, or [] when
 *          qstr is empty or contains no '='
 */
function parseFormQuery(qstr) {
    if (qstr.length === 0 || qstr.indexOf('=') < 0) {
        return [];
    }
    // Escape the characters that are significant inside an HTML attribute.
    function escapeAttr(text) {
        return String(text)
            .replace(/&/g, '&amp;')
            .replace(/"/g, '&quot;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;');
    }
    return qstr.split('&').map(function (pair) {
        var kv = pair.split('=');
        return '<input type="hidden" name="' + escapeAttr(kv[0]) + '" value="' + escapeAttr(kv[1] || '') + '" />';
    });
}
/**
 * Replay the original request as a POST form submission.
 *
 * The verification parameters in `data` are merged into the query string of
 * requestInfo.url, and requestInfo.data is converted into hidden inputs.
 *
 * @param {Object} data verification parameters to append to the URL
 */
function reform(data) {
    var target = parseURL(requestInfo.url);
    target.search = addQuery(target.search, data)
    var form = document.createElement('form');
    form.action = combineUrl(target);
    form.method = "POST";
    form.innerHTML = parseFormQuery(requestInfo.data).join('');
    // The form must be attached to the document before it is submitted.
    document.body.appendChild(form);
    form.submit();
}
// Options passed to the captcha module for the desktop ("PC") layout.
var NC_Opt = {
    renderTo: "nocaptcha", // id of the DOM element to render into
    appkey: "CF_APP_WAF", // application identifier
    scene: "register",
    trans: {"key1": "code100", "user": "default"},
    token: requestInfo.token, // umid token
    language: "cn", // language pack, Chinese by default
    isEnabled: true,
    times: 3,
    // Called after a successful check: show the success text, then replay the
    // original request with the verification parameters attached.
    success: function (data) {
        document.getElementById("nc_1_n1z").innerHTML='✓'
        document.getElementsByClassName('nc-lang-cnt')[0].innerHTML='<b style="margin-left:60px;-webkit-text-fill-color:#FF6A00 !important">验证成功!</b>'
        var d = {
            u_atoken: data.token,
            u_asession: data.sessionId,
            u_asig: data.sig,
            u_aref: requestInfo.refer
        };
        if (requestInfo.type === 'GET'){
            var parsedUrl = parseURL(requestInfo.url);
            parsedUrl.search = addQuery(parsedUrl.search,d)
            location.replace(combineUrl(parsedUrl));
        }else{
            reform(d);
        }
    },
    // Called when the slider check fails.
    fail: function (failCode) {
        // The original logged the undefined name `s`, which threw a ReferenceError.
        window.console && console.log(failCode);
    },
    // Called when the captcha fails to load.
    error: function (errorCode) {
        window.console && console.log(errorCode)
    }
};
// Options passed to the captcha module for the mobile ("H5") layout.
var NC_h5_Opt = {
    renderTo: "h5_nocaptcha", // id of the DOM element to render into
    appkey: "CF_APP_WAF", // application identifier
    scene: "register_h5",
    trans: {"key1": "code200", "user": "default"},
    token: requestInfo.token, // umid token
    language: "cn", // language pack, Chinese by default
    isEnabled: true,
    times: 3,
    // Called after a successful check: show the success text, then replay the
    // original request with the verification parameters attached.
    success: function (data) {
        document.getElementById("nc_1_n1z").innerHTML='✓'
        document.getElementsByClassName('nc-lang-cnt')[0].innerHTML='<b style="margin-left:60px;-webkit-text-fill-color:#FF6A00 !important">验证成功!</b>'
        // Fall back to the page token when the callback supplies none.
        if (data.token === undefined) data.token = requestInfo.token;
        var d = {
            u_atoken: data.token,
            u_asession: data.sessionId,
            u_asig: data.sig,
            u_aref: requestInfo.refer
        };
        if (requestInfo.type === 'GET'){
            var parsedUrl = parseURL(requestInfo.url);
            parsedUrl.search = addQuery(parsedUrl.search,d)
            location.replace(combineUrl(parsedUrl));
        }else{
            reform(d);
        }
    },
    // Called when the slider check fails.
    fail: function (failCode) {
        // The original logged the undefined name `s`, which threw a ReferenceError.
        window.console && console.log(failCode);
    },
    // Called when the captcha fails to load.
    error: function (errorCode) {
        window.console && console.log(errorCode)
    }
};
/**
 * Show the container for the detected layout and initialise the captcha
 * module in it, storing the instance on window.nc.
 */
function initNC() {
    var mobile = window._waf_is_mobile ? true : false;
    document.getElementById(mobile ? 'H5' : 'PC').style.display = 'block';
    AWSC.use("nc", function (state, module) {
        window.nc = module.init(mobile ? NC_h5_Opt : NC_Opt);
    })
}
</script>
</html>

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,48 @@
import requests
import execjs
def _compile_js(path):
    """Read the JavaScript file at *path* and return its compiled execjs context."""
    # ``with`` closes the handle that a bare open(...).read() would leak.
    with open(path, 'r', encoding='utf-8') as fp:
        return execjs.compile(fp.read())


# Each script is compiled once; the original re-read sign.js twice and
# demo.js once for every page requested.
sign_ctx = _compile_js('./sign.js')
decrypt_ctx = _compile_js('./demo.js')

pageNum = 1
# Controls how many pages are requested (the loop stops before this page number).
while pageNum < 2:
    # Build the timestamp and the signature header with the reversed JS.
    ts = int(sign_ctx.call('ts'))
    json_data = {
        'pageNo': pageNum,
        'pageSize': 40,
        'total': 5770,
        'AREACODE': '',
        'M_PROJECT_TYPE': '',
        'KIND': 'GCJS',
        'GGTYPE': '1',
        'PROTYPE': '',
        'timeType': '6',
        'BeginTime': '2022-07-18 00:00:00',
        'EndTime': '2023-01-18 23:59:59',
        'createTime': [],
        'ts': ts,
    }
    sign = str(sign_ctx.call('sign', json_data))
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json;charset=UTF-8',
        'Origin': 'https://ggzyfw.fujian.gov.cn',
        'Referer': 'https://ggzyfw.fujian.gov.cn/business/list/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'portal-sign': sign,
        'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # Send the request.
    response = requests.post('https://ggzyfw.fujian.gov.cn/FwPortalApi/Trade/TradeInfo', headers=headers, json=json_data).json()
    data = response['Data']
    # Decrypt the returned payload.
    ctx = decrypt_ctx.call('decrypt', data)
    print(ctx)
    pageNum += 1

View File

@ -0,0 +1,58 @@
const Crypto = require('crypto')
// Fixed prefix that sign() prepends to the serialized parameters before hashing.
var d = "3637CB36B2E54A72A7002978D0506CDF"
/**
 * Compute the request signature for a parameter object.
 *
 * Entries whose value is '' or undefined are deleted from `t` itself, then
 * the prefix `d` plus the serialized parameters is hashed.
 *
 * @param {Object} t request parameters (modified in place)
 * @returns {string} hex MD5 signature
 */
function sign(t) {
    for (var key in t) {
        if (t[key] === "" || t[key] === void 0) {
            delete t[key];
        }
    }
    var payload = d + l(t);
    return s(payload);
}
/**
 * Hash a string; thin wrapper that delegates to md5().
 *
 * @param {string} e text to hash
 * @returns {string} hex digest
 */
function s(e) {
    var digest = md5(e);
    return digest;
}
/**
 * Serialize a parameter object as key/value text.
 *
 * Keys are ordered with the comparator `u`; each defined entry contributes
 * its key followed by its value, with objects and arrays written as JSON.
 *
 * @param {Object} t request parameters
 * @returns {string} concatenated "keyvalue" text
 */
function l(t) {
    var out = "";
    Object.keys(t).sort(u).forEach(function (key) {
        var value = t[key];
        if (value === void 0) {
            return;
        }
        var structured = value && value instanceof Object || value instanceof Array;
        out += key + (structured ? JSON.stringify(value) : value);
    });
    return out
}
// Standard MD5, computed with Node's crypto module.
/**
 * @param {string} text text to hash
 * @returns {string} lower-case hex digest
 */
function md5(text){
    var hasher = Crypto.createHash('md5');
    hasher.update(text);
    return hasher.digest('hex');
}
/**
 * Case-insensitive comparator for sorting keys.
 *
 * @param {*} t first value
 * @param {*} e second value
 * @returns {number} 1, 0 or -1 after comparing the upper-cased string forms
 */
function u(t, e) {
    var left = t.toString().toUpperCase();
    var right = e.toString().toUpperCase();
    if (left > right) {
        return 1;
    }
    return left == right ? 0 : -1;
}
// Sample request parameters, used by the console.log(sign(data)) call at the
// end of the file. `data` is assigned without a declaration keyword.
data = {
'pageNo': 1,
'pageSize': 20,
'total': 0,
'AREACODE': '',
'M_PROJECT_TYPE': '',
'KIND': 'GCJS',
'GGTYPE': '1',
'PROTYPE': '',
'timeType': '6',
'BeginTime': '2022-07-18 00:00:00',
'EndTime': '2023-01-18 23:59:59',
'createTime': [],
// ts() is declared further down; function declarations are hoisted.
'ts': ts(),
}
// Generate the timestamp sent with each request.
/**
 * @returns {number} current time in milliseconds since the Unix epoch
 */
function ts(){
    return Date.now();
}
// Load-time check: print a timestamp and the signature of the sample data.
console.log(ts())
console.log(sign(data))

View File

@ -0,0 +1,50 @@
import requests
import execjs
import time
def _compile_js(path):
    """Read the JavaScript file at *path* and return its compiled execjs context."""
    # ``with`` closes the handle that a bare open(...).read() would leak.
    with open(path, 'r', encoding='utf-8') as fp:
        return execjs.compile(fp.read())


headers = {
    'authority': 'api.599.com',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    'origin': 'https://599.com',
    'pragma': 'no-cache',
    'referer': 'https://599.com/',
    'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
}
# Millisecond timestamp, used both in the signed parameters and in the query.
ts = int(time.time() * 1000)
# Parameters fed to the JS signing function ``Z`` together with the API path.
pre_params = {
    "appType": "3",
    "channelNumber": "GF1001",
    "comId": "8",
    "lang": "zh",
    "platform": "pc",
    "st": ts,
    "timeZone": "8",
    "version": "671",
    "versionCode": "671"
}
sign = _compile_js('./js/sss.js').call('Z', '/footballapi/core/matchlist/v2/immediate', pre_params)
# Query string actually sent: the same values plus the computed signature.
params = {
    'comId': '8',
    'lang': 'zh',
    'timeZone': '8',
    'version': '671',
    'versionCode': '671',
    'channelNumber': 'GF1001',
    'platform': 'pc',
    'appType': '3',
    'st': str(ts),
    'sign': sign,
}
response = requests.get('https://api.599.com/footballapi/core/matchlist/v2/immediate', params=params, headers=headers)
data = response.json()['data']
# The ``data`` field is passed to the JS ``decrypt`` function before printing.
ctx = _compile_js('./js/demo.js').call('decrypt', data)
print(ctx)

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,37 @@
const crypto = require('crypto-js')
// String form of the crypto-js MD5 digest of `text` (coerced to a string first).
function md5(text) {
    const digest = crypto.MD5(String(text));
    return digest.toString();
}
var e = '/footballapi/core/matchlist/v2/immediate'
var t = {
"appType": "3",
"channelNumber": "GF1001",
"comId": "8",
"lang": "zh",
"platform": "pc",
"st": 1678167676726,
"timeZone": "8",
"version": "671",
"versionCode": "671"
}
// Returns the module-level `e` (the API path string declared earlier in this file).
function l() {
return e
}
/**
 * Builds the request signature for path `e` and parameter object `t`.
 *
 * The parameters are copied into a new object in sorted-key order, every
 * key and value is appended to the path, md5("wjj") is appended to that,
 * and the result is hashed again. The returned value is the lower-cased
 * hash followed by the literal "99".
 */
function Z(e, t) {
    var n = {};
    Object.keys(t).sort().forEach(function (key) {
        n[key] = t[key];
    });

    var o = e;
    for (var r in n) {
        o = o + r + n[r];
    }
    o += md5("wjj");
    return md5(o).toLowerCase() + "99";
}
console.log(Z(e, t));

View File

@ -0,0 +1,209 @@
const CryptoJS = require('crypto-js')
const askEgGnDlalR = "ajmEmqsokwfpfWv8";//AESkey可自定义
const asi5jI3cvFQI = "bH7Ppp3nOF5k5PCt";//密钥偏移量IV可自定义
const ackbiFPNKGDI = "dE1E6BPpAF5gwUEN";//AESkey可自定义
const aci1c2jlP3KO = "fOf1MjRiLdsmtenp";//密钥偏移量IV可自定义
const dsk80WzdMTMv = "h8ByxcqtKbzeqa3q";//DESkey可自定义
const dsiCs366A1HA = "xJqk2s8ZDjgDHbBN";//密钥偏移量IV可自定义
const dcku3b1jsXMn = "oHwOHptKV6TukKXJ";//DESkey可自定义
const dciQs6k7qfCc = "prKerE8BHOzo9jvI";//密钥偏移量IV可自定义
// String form of the crypto-js MD5 digest of `text` (coerced to a string first).
function md5(text) {
    var digest = CryptoJS.MD5(String(text));
    return digest.toString();
}
// Base64 helpers built on the crypto-js encoders.
var BASE64 = {
    // UTF-8 text -> Base64 string.
    encrypt: function (text) {
        var words = CryptoJS.enc.Utf8.parse(text);
        return CryptoJS.enc.Base64.stringify(words);
    },
    // Base64 string -> UTF-8 text.
    decrypt: function (text) {
        var words = CryptoJS.enc.Base64.parse(text);
        return words.toString(CryptoJS.enc.Utf8);
    }
};
// DES helpers (CBC mode, PKCS#7 padding). The working key is the first 16
// characters of md5(key); the IV is characters 24..31 of md5(iv).
var DES = (function () {
    // Turns the raw key/IV strings into the crypto-js key and cipher options.
    function derive(key, iv) {
        var keyText = CryptoJS.MD5(key).toString().slice(0, 16);
        var ivText = CryptoJS.MD5(iv).toString().slice(24, 32);
        return {
            key: CryptoJS.enc.Utf8.parse(keyText),
            options: {
                iv: CryptoJS.enc.Utf8.parse(ivText),
                mode: CryptoJS.mode.CBC,
                padding: CryptoJS.pad.Pkcs7
            }
        };
    }

    return {
        // Encrypts `text`; returns the cipher's default string form.
        encrypt: function (text, key, iv) {
            var material = derive(key, iv);
            var sealed = CryptoJS.DES.encrypt(text, material.key, material.options);
            return sealed.toString();
        },
        // Decrypts `text`; returns the plaintext decoded as UTF-8.
        decrypt: function (text, key, iv) {
            var material = derive(key, iv);
            var opened = CryptoJS.DES.decrypt(text, material.key, material.options);
            return opened.toString(CryptoJS.enc.Utf8);
        }
    };
})();
// AES helpers (CBC mode, PKCS#7 padding). The working key is characters
// 16..31 of md5(key); the IV is the first 16 characters of md5(iv).
var AES = (function () {
    // Turns the raw key/IV strings into the crypto-js key and cipher options.
    function derive(key, iv) {
        var keyText = CryptoJS.MD5(key).toString().slice(16, 32);
        var ivText = CryptoJS.MD5(iv).toString().slice(0, 16);
        return {
            key: CryptoJS.enc.Utf8.parse(keyText),
            options: {
                iv: CryptoJS.enc.Utf8.parse(ivText),
                mode: CryptoJS.mode.CBC,
                padding: CryptoJS.pad.Pkcs7
            }
        };
    }

    return {
        // Encrypts `text`; returns the cipher's default string form.
        encrypt: function (text, key, iv) {
            var material = derive(key, iv);
            var sealed = CryptoJS.AES.encrypt(text, material.key, material.options);
            return sealed.toString();
        },
        // Decrypts `text`; returns the plaintext decoded as UTF-8.
        decrypt: function (text, key, iv) {
            var material = derive(key, iv);
            var opened = CryptoJS.AES.decrypt(text, material.key, material.options);
            return opened.toString(CryptoJS.enc.Utf8);
        }
    };
})();
// Returns a shallow copy of `obj` whose keys were inserted in sorted order,
// so that JSON.stringify of the copy is stable across key orderings.
function osmThj4lKY(obj) {
    return Object.keys(obj).sort().reduce(function (sorted, key) {
        sorted[key] = obj[key];
        return sorted;
    }, {});
}
/**
 * Builds the Base64-encoded GETMONTHDATA request envelope that every
 * parameter variant below starts from.
 *
 * The envelope carries appId (the salt), method, a millisecond timestamp,
 * the client type, the `{city}` object and an MD5 `secret` computed over
 * those fields plus the key-sorted JSON of the city object. Property
 * order is kept fixed because the envelope is serialised with
 * JSON.stringify.
 */
function _buildMonthDataPayload(city, salt) {
    var target = {city: city};
    var method = "GETMONTHDATA";
    var clientType = 'WEB';
    var timestamp = new Date().getTime();
    var envelope = {
        appId: salt,
        method: method,
        timestamp: timestamp,
        clienttype: clientType,
        object: target,
        secret: md5(salt + method + timestamp + clientType + JSON.stringify(osmThj4lKY(target)))
    };
    return BASE64.encrypt(JSON.stringify(envelope));
}

// Base64 envelope wrapped in DES with key/IV a7/a8 (same output as type2).
function getParams(city, salt, a7, a8) {
    return DES.encrypt(_buildMonthDataPayload(city, salt), a7, a8);
}

// Variant 1: Base64 envelope only, no cipher layer.
function type1(city, salt) {
    return _buildMonthDataPayload(city, salt);
}

// Variant 2: Base64 envelope wrapped in DES with key/IV a7/a8.
function type2(city, salt, a7, a8) {
    return DES.encrypt(_buildMonthDataPayload(city, salt), a7, a8);
}

// Variant 3: Base64 envelope wrapped in AES with key/IV a1/a2.
function type3(city, salt, a1, a2) {
    return AES.encrypt(_buildMonthDataPayload(city, salt), a1, a2);
}
/**
 * Decodes a response body: Base64 -> DES (a5/a6) -> AES (a1/a2) -> Base64,
 * then parses the resulting text as JSON and returns the parsed value.
 */
function decrypt(data, a1, a2, a5, a6) {
    var outer = BASE64.decrypt(data);
    var afterDes = DES.decrypt(outer, a5, a6);
    var afterAes = AES.decrypt(afterDes, a1, a2);
    var jsonText = BASE64.decrypt(afterAes);
    return JSON.parse(jsonText);
}
// Unpacker with the p,a,c,k,e,d signature: expands the packed source `p`
// by replacing every encoded token with its keyword from `k`.
//   p - packed source text
//   a - radix used to encode token indexes
//   c - number of tokens
//   k - keyword table, indexed by token number
//   e, d - scratch arguments: `e` is overwritten with the encoder below,
//          `d` is used as the token -> keyword dictionary
function file(p, a, c, k, e, d) {
// Encode the integer `c` in base `a`: digits below 36 use 0-9a-z, larger
// digits use String.fromCharCode(digit + 29); higher-order digits are
// produced by recursion.
e = function (c) {
return (c < a ? '' : e(parseInt(c / a))) + ((c = c % a) > 35 ? String.fromCharCode(c + 29) : c.toString(36))
}
;
// True when String.prototype.replace accepts a function replacement
// (''.replace(/^/, String) then yields ''): use the single-pass path.
if (!''.replace(/^/, String)) {
// Fill the dictionary: encoded token -> keyword, or the token itself when
// the keyword slot is empty.
while (c--) {
d[e(c)] = k[c] || e(c)
}
// Collapse the table to one callback that looks each matched word up in `d`.
k = [function (e) {
return d[e]
}
];
// Make the pattern below match any word ...
e = function () {
return '\\w+'
}
;
// ... and run the replacement loop exactly once.
c = 1
}
;
// Replace each token (whole words only) with its table entry.
while (c--) {
if (k[c]) {
p = p.replace(new RegExp('\\b' + e(c) + '\\b', 'g'), k[c])
}
}
return p
}
// Unpacks a packed script: `data` is spliced in as the argument list of a
// call to `file` and the resulting expression is evaluated.
// SECURITY NOTE(review): `data` is executed through eval, so whatever the
// caller passes runs as code. The Python callers in this repository pass
// text downloaded from the remote site - only run this in an environment
// where executing that content is acceptable.
function get_enc(data) {
return eval('file(' + data + ')')
}

View File

@ -0,0 +1,179 @@
import base64
import re
import time
from collections import Counter
import execjs
import requests
def remove_par(pat, string) -> tuple[int, str]:
    """Delete every match of *pat* from *string*.

    :param pat: regular expression to remove
    :param string: text to filter
    :return: ``(count, result)`` - the number of matches removed and the
        text with those matches deleted
    """
    # re.subn removes and counts in one pass instead of scanning twice.
    result, count = re.subn(pat, '', string)
    return count, result
def get_re_all(pat, string) -> tuple[list, list]:
    """Collect every ``(name, value)`` pair matched by *pat* in *string*.

    :param pat: regular expression with at least two groups; group 1 is
        the name and group 2 is the value
    :param string: text to search
    :return: ``(values, names)`` - two parallel lists; note that the
        values come first
    """
    matches = re.findall(pat, string)
    values = [match[1] for match in matches]
    names = [match[0] for match in matches]
    return values, names
def get_re_search(pat, string) -> str:
    """Return group 1 of the first match of *pat* in *string*.

    :param pat: regular expression containing at least one group
    :param string: text to search
    :return: the captured text, or ``''`` when nothing matches
    """
    found = re.search(pat, string)
    return found.group(1) if found else ''
class weatherCrawler:
    """Fetch monthly history data for one city from www.aqistudy.cn.

    The site ships a dynamically generated JS file holding the keys, IVs,
    salt and POST field name for the request. The crawler downloads and
    unpacks that file, extracts those values, works out which of three
    request encodings is in use, then posts the request and decrypts the
    response through the helpers in ``getParams.js``.
    """

    def __init__(self):
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Referer': 'https://www.aqistudy.cn/historydata/',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        }
        self.cookies = {
            'Hm_lvt_6088e7f72f5a363447d4bafe03026db8': '1689668701',
            'Hm_lpvt_6088e7f72f5a363447d4bafe03026db8': str(int(time.time())),
        }
        self.city: str = ''  # city name to query
        self.file: str = ''  # fully unpacked dynamic JS
        self.salt = None  # salt passed to the JS request builders
        self.par = None  # name of the single POST form field
        self.key_iv: list = []  # dynamic key/IV values, in declaration order
        self.key_name: list = []  # variable names of those keys/IVs
        self.param: str = ''  # encoded request parameter

    @staticmethod
    def _call_js(func, *args):
        """Call *func* from ``getParams.js`` with *args* and return its result.

        The script is read inside a ``with`` block so the file handle is
        closed after every call.
        """
        with open('getParams.js', 'r', encoding='utf-8') as fh:
            source = fh.read()
        return execjs.compile(source).call(func, *args)

    def __get_file(self):
        """Download and unpack the dynamic JS for ``self.city`` into ``self.file``.

        :raises RuntimeError: when the page contains no dynamic JS link
        """
        params = {
            'city': self.city,
        }
        response = requests.get('https://www.aqistudy.cn/historydata/monthdata.php', params=params,
                                cookies=self.cookies, headers=self.headers)
        # Locate the dynamically named script referenced by the page.
        match = get_re_search(r'<script[^>]*src="[^"]*\/([^\/?]+)\?t=[^"]+"', response.text)
        if not match:
            # Raising a bare string is a TypeError in Python 3; raise a real exception.
            raise RuntimeError('没有找到动态js文件')
        filename = 'https://www.aqistudy.cn/historydata/resource/js/' + match
        html = requests.get(filename, headers=self.headers).text
        # Drop the packer prologue, then unpack the remaining argument list in JS.
        pattern = r'eval\(function\(p,a,c,k,e,d\){.*?}return p}'
        _, filtered_html = remove_par(pattern, html)
        a = self._call_js('get_enc', filtered_html[1:-3])
        # A remaining `eval` means the payload is wrapped in further layers.
        count, a = remove_par('eval', a)
        if count > 0:
            # Each 'dweklxde' occurrence counts as one base64 round
            # (observed: 1-2 rounds; presumably the site's decode function name).
            count, a = remove_par('dweklxde', a)
            # Strip the call syntax so only the base64 text remains.
            result = a.replace("(", "").replace(")", "").replace("'", '')
            self.file = self.multiple_base64_decode(result, count)
        else:
            # No eval: the unpacked text already is the complete JS.
            self.file = a

    def __get_params(self):
        """Extract the dynamic values from ``self.file``.

        Fills ``key_iv``/``key_name`` (the ``const`` declarations), ``par``
        (the POST field name) and ``salt`` (first single-quoted ``var``).
        """
        key_iv, key_name = get_re_all(r'const\s*(\w+)\s*=\s*"([^"]+)"', self.file)
        par = get_re_search(r'data:\s*{\s*(\w+)\s*:\s*\w+\s*}', self.file)
        salt = get_re_search(r'var\s*\w+\s*=\s*\'(.*?)\'', self.file)
        self.key_name = key_name
        self.key_iv = key_iv
        self.par = par
        self.salt = salt

    def __calculate_type(self):
        """Detect the request encoding and build ``self.param`` accordingly.

        The encoding is inferred from how often the 3rd, 4th, 7th and 8th
        key names occur in the dynamic JS.

        :raises ValueError: when the occurrence counts match no known type
        """
        # Raw string: the original non-raw literal contained the invalid escape `\]`.
        punctuations = r'''!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'''
        # Exactly eight key/IV names are expected; only four are counted.
        _, _, a3_name, a4_name, _, _, a7_name, a8_name = self.key_name
        count_keys = [a3_name, a4_name, a7_name, a8_name]
        translator = str.maketrans(punctuations, ' ' * len(punctuations))
        # Split the JS into bare words and count each key name.
        counters = Counter(self.file.translate(translator).split())
        counts = [counters[key] for key in count_keys]
        if counts == [1, 1, 1, 1]:
            self.param = self._call_js('type1', self.city, self.salt)
        elif counts == [1, 1, 2, 2]:
            self.param = self._call_js('type2', self.city, self.salt, self.key_iv[6], self.key_iv[7])
        elif counts == [2, 2, 1, 1]:
            self.param = self._call_js('type3', self.city, self.salt, self.key_iv[2], self.key_iv[3])
        else:
            # A new variant appeared: inspect self.file and extend the branches above.
            print(counts)
            raise ValueError(f'unknown encryption type, key counts: {counts}')

    def __do_post(self):
        """Send the encoded request and return the decrypted data.

        :return: ``result.data`` of the decrypted response
        """
        # Eight key/IV values are expected; decryption uses the 1st, 2nd, 5th and 6th.
        a1, a2, _, _, a5, a6, _, _ = self.key_iv
        data = {self.par: self.param}
        response = requests.post('https://www.aqistudy.cn/historydata/api/historyapi.php', cookies=self.cookies,
                                 headers=self.headers,
                                 data=data)
        weather_data = self._call_js('decrypt', response.text, a1, a2, a5, a6)
        return weather_data['result']['data']

    def get_weather_data(self):
        """Prompt for a city on stdin and return its decrypted data.

        :return: the value produced by the final POST/decrypt step
        """
        self.city = input('请输入城市名称: ')
        self.__get_file()
        self.__get_params()
        self.__calculate_type()
        return self.__do_post()

    @staticmethod
    def multiple_base64_decode(string, count) -> str:
        """Base64-decode *string* *count* times, decoding as UTF-8 each round."""
        decoded_string = string
        for _ in range(count):
            decoded_string = base64.b64decode(decoded_string).decode("utf-8")
        return decoded_string
if __name__ == '__main__':
    # Keep prompting for cities and printing their data until interrupted.
    crawler = weatherCrawler()
    while True:
        monthly_data = crawler.get_weather_data()
        print(monthly_data)

View File

@ -0,0 +1,149 @@
import base64
import re
import time
from collections import Counter
import execjs
import requests
def remove_par(pat, string) -> tuple[int, str]:
    """Delete every match of *pat* from *string*.

    :param pat: regular expression to remove
    :param string: text to filter
    :return: ``(count, result)`` - the number of matches removed and the
        text with those matches deleted
    """
    # re.subn removes and counts in one pass instead of scanning twice.
    result, count = re.subn(pat, '', string)
    return count, result
def multiple_base64_decode(string, count) -> str:
    """Base64-decode *string* *count* times, decoding as UTF-8 each round.

    :param string: base64 text (possibly nested several layers deep)
    :param count: number of decode rounds to apply
    :return: the text after the final round
    """
    text = string
    remaining = count
    while remaining > 0:
        text = base64.b64decode(text).decode("utf-8")
        remaining -= 1
    return text
def get_re_all(pat, string) -> tuple[list, list]:
    """Collect every ``(name, value)`` pair matched by *pat* in *string*.

    :param pat: regular expression with at least two groups; group 1 is
        the name and group 2 is the value
    :param string: text to search
    :return: ``(values, names)`` - two parallel lists; note that the
        values come first
    """
    matches = re.findall(pat, string)
    values = [match[1] for match in matches]
    names = [match[0] for match in matches]
    return values, names
def get_re_search(pat, string) -> str:
    """Return group 1 of the first match of *pat* in *string*.

    :param pat: regular expression containing at least one group
    :param string: text to search
    :return: the captured text, or ``''`` when nothing matches
    """
    found = re.search(pat, string)
    if not found:
        return ''
    return found.group(1)
def get_file(city):
    """Download and unpack the dynamic JS served for *city*.

    :param city: city name sent as the ``city`` query parameter
    :return: the unpacked JS text, or ``None`` when the page contains no
        dynamic JS link
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Referer': 'https://www.aqistudy.cn/historydata/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }
    cookies = {
        'Hm_lvt_6088e7f72f5a363447d4bafe03026db8': '1689668701',
        'Hm_lpvt_6088e7f72f5a363447d4bafe03026db8': str(int(time.time())),
    }
    params = {
        'city': city,
    }
    response = requests.get('https://www.aqistudy.cn/historydata/monthdata.php', params=params, cookies=cookies,
                            headers=headers)
    # Locate the dynamically named script referenced by the page.
    match = get_re_search(r'<script[^>]*src="[^"]*\/([^\/?]+)\?t=[^"]+"', response.text)
    if not match:
        return None
    filename = 'https://www.aqistudy.cn/historydata/resource/js/' + match
    html = requests.get(filename, headers=headers).text
    # Drop the packer prologue, then unpack the remaining argument list in JS.
    pattern = r'eval\(function\(p,a,c,k,e,d\){.*?}return p}'
    _, filtered_html = remove_par(pattern, html)
    # Read the helper inside a `with` block so the handle is always closed.
    with open('getParams.js', 'r', encoding='utf-8') as fh:
        js_source = fh.read()
    a = execjs.compile(js_source).call('get_enc', filtered_html[1:-3])
    # A remaining `eval` means the payload is wrapped in further layers.
    count, a = remove_par('eval', a)
    if count == 0:
        return a
    # Each 'dweklxde' occurrence counts as one base64 round.
    count, a = remove_par('dweklxde', a)
    result = a.replace("(", "").replace(")", "").replace("'", '')
    return multiple_base64_decode(result, count)
def get_params(data):
    """Extract the dynamic request values from the unpacked JS *data*.

    :param data: unpacked dynamic JS text
    :return: ``((key_names, key_values), post_field_name, salt)``
    """
    values, names = get_re_all(r'const\s*(\w+)\s*=\s*"([^"]+)"', data)
    return (
        (names, values),
        get_re_search(r'data:\s*{\s*(\w+)\s*:\s*\w+\s*}', data),
        get_re_search(r'var\s*\w+\s*=\s*\'(.*?)\'', data),
    )
def calculate_type(string, keys: list):
    """Infer the request encoding from how often each key name occurs.

    :param string: the unpacked dynamic JS
    :param keys: the four key/IV variable names to count
    :return: ``1`` (base64 only), ``2`` or ``3`` for the known count
        patterns; ``None`` for an unknown pattern, in which case the JS
        is printed for inspection
    """
    # Raw string: the original non-raw literal contained the invalid escape `\]`.
    punctuations = r'''!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~'''
    translator = str.maketrans(punctuations, ' ' * len(punctuations))
    # Split the JS into bare words and count each key name.
    counters = Counter(string.translate(translator).split())
    counts = [counters[key] for key in keys]
    print(counts)
    known_types = {
        (1, 1, 1, 1): 1,  # every key appears once: base64 only
        (1, 1, 2, 2): 2,
        (2, 2, 1, 1): 3,
    }
    kind = known_types.get(tuple(counts))
    if kind is None:
        print(string)
    return kind
def do_post(par_key, value, decrypto_dict):
    """Post the encoded request, decrypt the response and print its data.

    :param par_key: name of the single POST form field
    :param value: encoded request parameter
    :param decrypto_dict: dict holding the ``a1``, ``a2``, ``a5`` and
        ``a6`` key/IV values; the raw response text is stored under
        ``'data'`` as a side effect
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Referer': 'https://www.aqistudy.cn/historydata/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    }
    cookies = {
        'Hm_lvt_6088e7f72f5a363447d4bafe03026db8': '1689668701',
        'Hm_lpvt_6088e7f72f5a363447d4bafe03026db8': str(int(time.time())),
    }
    data = {par_key: value}
    response = requests.post('https://www.aqistudy.cn/historydata/api/historyapi.php', cookies=cookies, headers=headers,
                             data=data)
    decrypto_dict['data'] = response.text
    # Read the helper inside a `with` block so the handle is always closed.
    with open('getParams.js', 'r', encoding='utf-8') as fh:
        js_source = fh.read()
    weather_data = execjs.compile(js_source).call(
        'decrypt', decrypto_dict['data'], decrypto_dict['a1'], decrypto_dict['a2'],
        decrypto_dict['a5'], decrypto_dict['a6']
    )
    print(weather_data['result']['data'])
if __name__ == '__main__':
    citys = ['泸州', '上海', '北京', '杭州', '重庆']
    # Compile the helper script once; the original re-opened it in every
    # branch and never closed the handle.
    with open('getParams.js', 'r', encoding='utf-8') as js_fh:
        js_ctx = execjs.compile(js_fh.read())
    for city in citys:
        file = get_file(city)
        if file is None:
            # get_file returns None when the page has no dynamic JS link;
            # get_params would crash on None, so skip this city.
            print(f'dynamic js not found for {city}, skipped')
            continue
        datas = get_params(file)
        # Exactly eight key/IV declarations are expected.
        a1_name, a2_name, a3_name, a4_name, a5_name, a6_name, a7_name, a8_name = datas[0][0]
        a1, a2, a3, a4, a5, a6, a7, a8 = datas[0][1]
        par_key = datas[1]
        salt = datas[2]
        calculate_key = [a3_name, a4_name, a7_name, a8_name]
        types = calculate_type(file, calculate_key)
        decrypto_dict = {
            'a1': a1,
            'a2': a2,
            'a5': a5,
            'a6': a6
        }
        print(salt)
        if types == 1:
            value = js_ctx.call('type1', city, salt)
        elif types == 2:
            value = js_ctx.call('type2', city, salt, a7, a8)
        elif types == 3:
            value = js_ctx.call('type3', city, salt, a3, a4)
        else:
            # Unknown encoding: calculate_type already printed the JS.
            continue
        do_post(par_key, value, decrypto_dict)

View File

@ -0,0 +1,18 @@
const Crypto = require('crypto-js')
var lastFetchTime = 1674106181275
var r = "BTk5eL+/n9qX/mzrBE6Zs2F0oLJbn4B25d27rF3+D68svjMzlhXYcPegq/9L2aOEB36xL9TPRrAspOkjYkjQ1ghYPVfy7LQ1OXunJc++mfJQzLR7vIBTKJNsiGNDW8o5HSQpJPI3EcFf78dIqfGrl7J07BAmQ5R2RWycjhMlp10ciYuz9cdWRjdmBkQdCef1C/Czy/du+msFps/4/IgqcRLkTuWfuSFQdGaif5TFL3qS7qGBjJU15T04L6oxwbDHcCvI5kI9xv7bdRh8RMwCmV3ZqVY/3Gly55XkIdwHFb5YDUbgpS3NspzX/7fj/k+VC7dAscaqXfZHZSiXhXHWTgirfoUR4RWTSMNu1XbOwERbpfkF5lNQ3YFumIDbDa6JnW4t12aBTiclj9N3+eIpntV8/f+MRo5kwWeV2FqyP6vu8mjVD++Sywo0owLmNoNdNjsE8jInX7FF4z/RLdMzLOZh/2TwJZXiw5c2EBMCM4MKDg+s0yo/RZ9tHX7A1Kq2I5ExByvAFkf64w8p+j5Vz10vRB+3g2bqARNnSwZc2Jtr65IexcFQX2w9/4n/3gzDqZcjpfXBrJ4ZemEwrXJN05DmnG29hVXjPDYk8WvmHOOKTM+V7MuN8MfnOkC9kGYXvYI/gU7jwY+asGwu9sHgQagpz4Z5vy1M5VhaGq71ob4HStLDrlqG521aoT9eBmRN8BcRG0GMpaC1gJtD8wtIM2u7EQi0XPMMAW+WVQIhJSOpLTOfxYttFJUMG9ZyBwU8Lrp+szVXiFvRA3Qt1d2dmZgdXspcbly8PH7pvRZYT5HN1cJSgpHnh+p3qA+Nw3UzMr9zJo0jntr42bLg3ifNka38gjSwybnEpk3/9NNkACNnibWjj91VtaV0ht9fAggDTYAw3oIk8Jsb2EzewPdk5QyeJIo8EWElKK11Y5BCgMYoWFtFzVjpkZdJBzxtFqrXiHi8twUqvkWAF0tr391/Z7zpzBrG1rwaDtXtRHRZksL1itae9ktvs6l260ZgY72MsmzkeCWwVHPQUD8uu1O0T7ij8JdIHxRlfVSWFMiaKtl7ZxsutN3uTWHt9NWl2aZfYDR+Sx05rzr2rIOjSDoInyh9oXZVc7SZLP72ESUIWn00KXCWJ0wyQGXXYtcR1J0joZ9V0NxdvoY1iwgzbDbRPzOi4i7dA/L/UBeC+Hjpo13TjsmMEyr/eaEvWVQCstY471o8+mpv0gnJ8HdRnb6Blfe+yQ+EINlDUMgFh/MkUSkqAFRv4UzwOeiA9X3L9gqjBsGc9y4RX1l8zQklC46qcjz4cyuoTbBsppyMPs0CureZ3EkI2FZdslvj9CJKOIzG6CwrxtfeNdoks9K2cl2d1S/rn2GNvLqsnZSav1V5dONsC0rlYMaT2H/SEF3Yr5RvhCUroym1F0U/E2ZJ7aN1M/HmfdrTFok1ac5TEW3NPqkAYMcZeZ6UKpsDDZrBdGA06QCLlNReJ7MwLuCalKBf0b/JRH1VTWwmnLccfiWhhCP37Go5hZQfxZO3aJbYGuSE/i5sjuiw04xT+YW6hxio5/1PAR9ishJXVktsDITju3BDIuuejAZjkw9N6s+YMr6QUDWOwwPnxD2VNy1JB2B8RLF9ri53u7QLuT0qqYaRTTvcEehpk/WMPQ9FKZztaqOfeP7XGOXQI80977WGzEU4m54oMuMZjT3AkYjoecS4aTNBwTDissuYAaockFhVDun+z8lyM83mIxO0UWHW2AmDPuLkEzHEElsvkYkZxjyuv6jIc7ZTl4gacSbe+FNuSgm3cxXR8S01/kurythhZGsjNSE0EwLcHr32kPKkBc8q+R7Ix7yfSkFT1WuF1FRHvKad3xyLfEyfG507xZEqFfWgHlv8j/Z5PBqK2aPBFlkd+HA3Zk59w7ygGpKzBcYbWdAf1ygeuFJpjW0juR7aXahV8+7JzXyVvFa50jiVygUQe1njjG1UDeWpjYOwWoFxc1oUFKN4oZZ
gd+skzgUVXY9E6tkZlsG37MTf0rSvsHZV+548vj1jiC1BIQ5t7rDhuRUsmlw5gw11LstnpdfcTUNE2Vz5YltTmVDok5Jk1Uf3jbqn3VhFU2vBgceiF8Ulu2cwZ3e8IQnIkcEEam25oJxckon1UcMtnkTPBZHIXEavWDRZ4pxNcjm+17TTMgjrlEvQdAX7gffAqZB5Ai1t+iBeE5Mm5ARSvBT74GaUrZ1Rvs+fPe0PA63cj1IvRDI5afqBFSIcEAUajn4e7dnRO98nItjhp9gN5hD8DCCN/J28VNoKrVcJ5tasE6BaBv2isv1FKn3AhAaMtM2zmWbzYWvo7/MUk4wV2hriVKHsu9yUceBv+sV2S/63B0GyJ6Fe5vtwNHIXkXS1oTiUNdpvkri3hW7SR2Ai8Yk+bEBQCBAWYN08xRnF9eXJym96fhEUGz0X61Pe3Uvx/m6Ix9WcfEPwQP7UVLzcKoWSf1BbEaW3HQLJJgIpN0SgVtsxsjGbIJ0Sk2Ciuuk8JkBKPCaqgE6E/LEej7sToPzpzFXwBdnWW2H/6L5Ast/AFindK05xwp5n0Q+hZH4KOmnMP/41RmAiHi0IFmPh8lu/7ZRjFvIKoPje+f8DOZatxRYqP7gZH5Y11K/zCb6RXzrZi3j1CuybaBRSZqkwLuwv2CozQAU9ABYneLNCCmwVLi2C7cuHKxdc/OjUJl9VhG7xeik1fRFp70pdAALfjgBdh8y9e4QAjxp1XoKiLomjwewrCHXnYfsVfDYQQhyAmYDCMXVY5k3oHPGViJQfBGMp/baAyxgYDE7Kp3nuEnG04Z+PuG+XiICmadq1vBAaG//2HT5TNE9BrODsKV8EYHbrFXv8gO/iqy7MnnDVYkxOvZeUnyqrE+zyGD3gqerYu9VHieQS2ya2exDHeax45631ihXiGQSvoMRGzdgiE2a5iAF5muggkLrbJ2EGgje4fwqSeo+3cDCvqRQgLe7Nfr5gcMj9KqBsW4ClvpC3/1OvWLIkT1NPKKUoT4QZuv4MjtgHHkgCpgybA1zUv0TOQZqdsJ+Dnd083ZE9DgfDH+D4n+Fcs6Lg92CpnAN7zM79vaR8/gLy0s1S53udpDe/SAkc8up/r/3M3cukmbRyFACxCsWeBXrKDlWAgH0i0ktesYtJoIWJJuQTO0GqJw0E6QgaVK9cCpmNyIVvUih2Imtv+CJv2dkxaoINc8294PnVpXt80UkEWUen2Pvzaefw7UGYOkt80NHX7YiazV89Y6c3PO2/fUwhEy/zyxj04E44YJD7jvVTcnNAnAsMMa+XRtM48ootzKu9utJKlmW6AOr/iewyZCyA0Hf98zi1NhSS3Mfvvao54HBaO0oigG2l4mUCxZwv425eHD0VWafei30AG6t3jTM0rn5YVmwZaLkGeKxO+9QfhPdSS4yfemVVi4nrX+D/VWvDCd+Vkp20wFc2GoEMpRyQg9/TTBHH0hVXmrfhu34nE/0pn2icjk/uRxE1UsmJrs4It0EcURtyZ5ypWYokYgVL5wypV0MxIs84DP9RqZ+KB3Gle+yTdlM87dJvKQZZwXMRfG815JcXJxwwiv1ifGqTtEZfN9rh3dZ7wS/EqXvw7FBf9B6no/4YTw4zBGJWxcpEEQto5W/I4x+xCoschZEvu69Lh1sEtug1Rq3zlIsDA+fXi1N1eR57OD7xcDVOqRclfn/LaRYPLX3Wht17LlifATgnSHWbjU1uUyv2/ksLl4Urthhs5c3J38XfWpm7JIKh910="
// AES-decrypts `r` and returns the plaintext decoded as UTF-8.
// Key and IV are both the UTF-8 bytes of `lastFetchTime` followed by "000".
function decrypt(r, lastFetchTime) {
    const key = Crypto.enc.Utf8.parse(lastFetchTime + "000");
    const iv = Crypto.enc.Utf8.parse(lastFetchTime + "000");
    const plain = Crypto.AES.decrypt(r.toString(), key, { iv: iv });
    return plain.toString(Crypto.enc.Utf8);
}
console.log(decrypt(r, lastFetchTime))

View File

@ -0,0 +1,45 @@
import requests
import execjs
cookies = {
    'mobile_iindex_uuid': '9f0ae384-2821-5797-8a76-87bb1cef4a1f',
    'Hm_lvt_2873e2b0bdd5404c734992cd3ae7253f': '1674101222,1674103567',
    'Hm_lpvt_2873e2b0bdd5404c734992cd3ae7253f': '1674103567',
}
headers = {
    'authority': 'www.chinaindex.net',
    'accept': 'application/json, text/plain, */*',
    'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
    # 'cookie': 'mobile_iindex_uuid=9f0ae384-2821-5797-8a76-87bb1cef4a1f; Hm_lvt_2873e2b0bdd5404c734992cd3ae7253f=1674101222,1674103567; Hm_lpvt_2873e2b0bdd5404c734992cd3ae7253f=1674103567',
    'funcid': 'undefined',
    'incognitomode': '0',
    'referer': 'https://www.chinaindex.net/ranklist/5/0',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'uuid': '9f0ae384-2821-5797-8a76-87bb1cef4a1f',
}
params = {
    'keyWord': '李知恩',
    'sign': 'b3776cdf6331ee0f6653d1de544291c3'
}
response = requests.get(
    'https://www.chinaindex.net/iIndexMobileServer/mobile/comm/getSearchResult',
    params=params,
    cookies=cookies,
    headers=headers,
)
# Parse the body once; both fields come from the same JSON document.
payload = response.json()
r = payload['data']
lastFetchTime = payload['lastFetchTime']
# `decrypt` in demo.js takes the `data` value and `lastFetchTime`.
# Read the script inside a `with` block so the handle is closed.
with open('./demo.js', 'r', encoding='utf-8') as js_fh:
    js_source = js_fh.read()
ctx = execjs.compile(js_source).call('decrypt', r, lastFetchTime)
print(ctx)

View File

@ -0,0 +1,17 @@
// const Crypto = require('crypto-js')
//
//
// var data = 'Z21kD9ZK1ke6ugku2ccWuz4Ip5f4PLCoxWstZf_6UUyBoy8dpWc3NOXFRrnPMya7chcEL7e2Yz1xjFqcfdncOW4vOoJ66RTmRa8-dGZla_ExpWOUP0G1QJFtJ6Gj0ngir07R0ETWttaGO185v5rccLlZKqOCmJuChZSA-Dw9U6B2AOK4-RqYjAQEQ5vF7ph71eC5ZEvV6dm_xv0ywEOKi58R9xWx7fiJytxxlsz-oprAHdRXnI6kWszLLJJpr45DMBjoeArZfVssgWXzX_IlNUvTtj_1o95BpERVvV1FxGEeN-_TLgLaK9j7rjT4O-yPHpbuCk9q1BpLVSh3B4CPWCZPMIHwJiFtfQAC8_t-HWs45DWbW54DEny_doBItZ6v'
// var key = 'ydsecret://query/key/B*RGygVywfNBwpmBaZg*WT7SIOUP2T0C9WHMZN39j^DAdaZhAnxvGcCY6VYFwnHl'
// var iv = 'ydsecret://query/iv/C@lZe2YzHtZ2CYgaXKSVfsb7Y4QWHjITPPZ0nQp87fBeJ!Iv6v^6fvi2WN@bYpJ4'
//
// var ax = [8, 20, 157, 167, 60, 89, 206, 98, 85, 91, 1, 233, 47, 52, 232, 56]
// var b = [210, 187, 27, 253, 232, 59, 56, 195, 68, 54, 99, 87, 183, 156, 174, 28]
let data01 = '08149da73c59ce62555b01e92f34e838'// hex-encoded bytes
let newdata = Buffer.from(data01,'hex');// load the decoded bytes into a Buffer first
console.log("newdata ",newdata);
console.log(newdata.toString("utf-8"));// toString() then renders the buffer as a string

View File

@ -0,0 +1,75 @@
import json
from Crypto.Cipher import AES
import base64
import time
from hashlib import md5
import requests
def sign(t=None):
    """Return the MD5 request signature for the millisecond timestamp *t*.

    :param t: timestamp to sign; defaults to the current time. Pass the
        same value that is sent as ``mysticTime`` so signature and
        request always agree.
    :return: hex MD5 digest of the signing string
    """
    if t is None:
        t = int(time.time() * 1000)
    n = f'client=fanyideskweb&mysticTime={t}&product=webfanyi&key=fsdsogkndfokasodnaso'
    return md5(n.encode('utf-8')).hexdigest()


def decrypto(data):
    """Decrypt a URL-safe-base64, AES-CBC encrypted response body.

    :param data: response text as returned by :func:`post`
    :return: decrypted UTF-8 text with surrounding whitespace removed
    """
    key = b'\x08\x14\x9d\xa7\x3c\x59\xce\x62\x55\x5b\x01\xe9\x2f\x34\xe8\x38'
    iv = b'\xd2\xbb\x1b\xfd\xe8\x3b\x38\xc3\x44\x36\x63\x57\xb7\x9c\xae\x1c'
    aes = AES.new(key, AES.MODE_CBC, iv)
    den_text = aes.decrypt(base64.urlsafe_b64decode(data))
    # Remove PKCS#7 padding explicitly: str.strip() only drops whitespace,
    # so pad bytes such as \x01-\x08 used to leak into the result. The
    # padding is stripped only when it is well-formed.
    pad_len = den_text[-1] if den_text else 0
    if 1 <= pad_len <= 16 and den_text.endswith(bytes([pad_len]) * pad_len):
        den_text = den_text[:-pad_len]
    return str(den_text, 'utf-8').strip()


def post(w, f, t):
    """Send one translation request and return the raw response text.

    :param w: text to translate
    :param f: source language code
    :param t: target language code
    :return: the response body, still encrypted
    """
    cookies = {
        'OUTFOX_SEARCH_USER_ID': '123456789@192.168.60.5',
    }
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Origin': 'https://fanyi.youdao.com',
        'Referer': 'https://fanyi.youdao.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
    }
    # One clock read feeds both the signature and `mysticTime`; two
    # separate reads could straddle a millisecond boundary and mismatch.
    mystic_time = int(time.time() * 1000)
    data = {
        'i': w,
        'from': f,
        'to': t,
        'dictResult': 'true',
        'keyid': 'webfanyi',
        'sign': sign(mystic_time),
        'client': 'fanyideskweb',
        'product': 'webfanyi',
        'appVersion': '1.0.0',
        'vendor': 'web',
        'pointParam': 'client,mysticTime,product',
        'mysticTime': str(mystic_time),
        'keyfrom': 'fanyi.web',
    }
    response = requests.post('https://dict.youdao.com/webtranslate', headers=headers, data=data, cookies=cookies)
    return response.text
if __name__ == '__main__':
    # Interactive loop: read languages and a word, print the decrypted response.
    while True:
        try:
            From = input('请输入开始语言(自动auto, 中文zh-CHS, 韩文ko, 英文en)\n')
            To = input('请输入翻译的语言(默认, 中文zh-CHS, 韩文ko, 英文en)\n')
            word = input('请输入单词:')
            enc = post(word, From, To)
            ctx = decrypto(enc)
            print(ctx)
        except (KeyboardInterrupt, EOFError):
            # The original bare `except:` swallowed Ctrl-C and end-of-input,
            # so the loop could never be left; exit cleanly instead.
            break
        except Exception:
            print('出现异常,请重新输入!')
            continue

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,50 @@
import random
from binascii import hexlify
import base64
from Crypto.Cipher import AES
e = "010001"
g = "0CoJUm6Qyw8W8jud"
f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
i3x = '{"csrf_token":"","cursor":"1672939386847","offset":"0","orderType":"1","pageNo":"3","pageSize":"20","rid":"R_SO_4_1835283134","threadId":"R_SO_4_1835283134"}'
# Generate a random alphanumeric string (callers use 16 characters).
def RandomString(a):
    """Return *a* characters drawn at random from ``[a-zA-Z0-9]``.

    Characters are drawn with replacement. ``random.sample`` never
    repeated a character and raised ``ValueError`` for lengths above 62.

    :param a: number of characters to generate
    :return: the random string
    """
    string = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    return ''.join(random.choices(string, k=a))
# AES encryption
def AESEncrypto(text, key):
    """Encrypt *text* with AES-CBC under *key* and return base64 text.

    The plaintext is padded to a 16-byte multiple with bytes whose value
    equals the pad length; the IV is the fixed string ``0102030405060708``.

    :param text: plaintext string
    :param key: key string, UTF-8 encoded before use
    :return: base64-encoded ciphertext
    """
    block_size = 16
    key_bytes = key.encode("utf-8")
    iv_bytes = "0102030405060708".encode("utf-8")
    raw = text.encode("utf-8")
    fill = block_size - len(raw) % block_size
    padded = raw + fill * bytes([fill])
    cipher = AES.new(key_bytes, AES.MODE_CBC, iv_bytes)
    return base64.b64encode(cipher.encrypt(padded)).decode("utf-8")
# RSA encryption
def RSAEncrypto(text):
    """Encrypt *text* with unpadded modular exponentiation.

    The text is reversed, read as a big-endian integer, raised to the
    module-level exponent ``e`` modulo ``f`` (both hex strings), and
    returned as lower-case hex left-padded with zeros to 131 characters.

    :param text: string to encrypt (callers pass the random AES key)
    :return: hex string of the result
    """
    reversed_text = text[::-1]  # reverse the text
    message = int(hexlify(reversed_text.encode('utf-8')), 16)
    exponent = int(e, 16)
    modulus = int(f, 16)
    cipher_int = pow(message, exponent, modulus)
    # NOTE(review): 131 matches setMaxDigits(131) in the JS version, which is
    # a digit count, not a hex length - confirm the intended pad width.
    return format(cipher_int, 'x').zfill(131)
def d(text):
    """Build the encrypted request fields for *text*.

    The text is AES-encrypted with the fixed key ``g``, then again with a
    fresh random 16-character key; that random key is RSA-encrypted.

    :param text: JSON request body as a string
    :return: dict with ``encText`` and ``encSecKey``
    """
    i = RandomString(16)
    first_pass = AESEncrypto(text, g)
    return {
        "encText": AESEncrypto(first_pass, i),
        "encSecKey": RSAEncrypto(i)
    }

View File

@ -0,0 +1,392 @@
// crypto-js supplies the AES primitives used by `b` below.
const CryptoJS = require('crypto-js')
const jsdom = require('jsdom') // npm install jsdom
const { JSDOM } = jsdom
// Minimal DOM, presumably to give code that expects a browser a
// `window`/`document`.
// NOTE(review): `<\p>` looks like a typo for `</p>` - confirm before changing.
const dom = new JSDOM('<!DOCTYPE html><p>Hello World<\p>')
// Assigned without a declaration, so both names become globals in sloppy mode.
window = dom.window
document = window.document
var maxDigits, ZERO_ARRAY, bigZero, bigOne, dpl10, lr10, hexatrigesimalToChar, hexToChar, highBitMasks, lowBitMasks, biRadixBase = 2, biRadixBits = 16, bitsPerDigit = biRadixBits, biRadix = 65536, biHalfRadix = biRadix >>> 1, biRadixSquared = biRadix * biRadix, maxDigitVal = biRadix - 1, maxInteger = 9999999999999998;
setMaxDigits(20),
dpl10 = 15,
lr10 = biFromNumber(1e15),
hexatrigesimalToChar = ["0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"],
hexToChar = ["0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f"],
highBitMasks = [0,32768,49152,57344,61440,63488,64512,65024,65280,65408,65472,65504,65520,65528,65532,65534,65535],
lowBitMasks = [0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535];
var xx = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
i3x = {
csrf_token: "",
cursor: "1672939386847",
offset: "0",
orderType: "1",
pageNo: "3",
pageSize: "20",
rid: "R_SO_4_1835283134",
threadId:"R_SO_4_1835283134",
}
var bMr1x = d(JSON.stringify(i3x), '010001', xx, '0CoJUm6Qyw8W8jud');
// Returns a random string of `a` characters drawn (with replacement)
// from [a-zA-Z0-9].
function a(a) {
    var alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    var out = "";
    for (var n = 0; n < a; n += 1) {
        var pick = Math.floor(Math.random() * alphabet.length);
        out += alphabet.charAt(pick);
    }
    return out;
}
// AES-CBC encrypts text `a` under key string `b` with the fixed IV
// "0102030405060708"; returns the cipher's default string form.
function b(a, b) {
    var key = CryptoJS.enc.Utf8.parse(b);
    var iv = CryptoJS.enc.Utf8.parse("0102030405060708");
    var plaintext = CryptoJS.enc.Utf8.parse(a);
    var options = {
        iv: iv,
        mode: CryptoJS.mode.CBC
    };
    return CryptoJS.AES.encrypt(plaintext, key, options).toString();
}
// Sets the digit capacity used for every BigInt created afterwards and
// rebuilds the shared zero template plus the bigZero/bigOne constants.
function setMaxDigits(a) {
    maxDigits = a;
    ZERO_ARRAY = new Array(maxDigits).fill(0);
    bigZero = new BigInt();
    bigOne = new BigInt();
    bigOne.digits[0] = 1;
}
// Constructor for the library's big integer: `digits` is a fresh copy of
// the zero template (or null when called with `true`), and `isNeg` starts
// out false.
function BigInt(a) {
    var skipDigits = typeof a === "boolean" && a === true;
    this.digits = skipDigits ? null : ZERO_ARRAY.slice(0);
    this.isNeg = false;
}
// RSA-encrypts string `a` with public exponent `b` and modulus `c`
// (both passed straight to RSAKeyPair); returns the encrypted string.
function c(a, b, c) {
    setMaxDigits(131);
    var keyPair = new RSAKeyPair(b, "", c);
    return encryptedString(keyPair, a);
}
// Returns the string `a` with its UTF-16 code units in reverse order.
function reverseStr(a) {
    return a.split("").reverse().join("");
}
// Formats one 16-bit digit as exactly four lower-case hex characters.
function digitToHex(a) {
    var mask = 15;
    var hex = "";
    // `i` is declared locally: the original assigned an undeclared `i`,
    // leaking a global and failing under strict mode.
    for (var i = 0; i < 4; ++i) {
        // Emit the lowest nibble first; the string is reversed afterwards.
        hex += hexToChar[a & mask];
        a >>>= 4;
    }
    return reverseStr(hex);
}
// Hex string of BigInt `a`: four hex characters per digit, from the
// highest used digit (per biHighIndex) down to digit 0.
function biToHex(a) {
var d, b = "";
// NOTE(review): the first biHighIndex(a) result is discarded; it looks
// redundant, but biHighIndex is defined outside this excerpt - confirm it
// has no side effects before removing the call.
for (biHighIndex(a),
d = biHighIndex(a); d > -1; --d)
b += digitToHex(a.digits[d]);
return b
}
// Returns a new BigInt holding only the lowest `b` digits of `a`.
function biModuloByRadixPower(a, b) {
    var remainder = new BigInt();
    arrayCopy(a.digits, 0, remainder.digits, 0, b);
    return remainder;
}
// Returns a new BigInt holding the digits of `a` from index `b` upward,
// moved down to start at index 0.
function biDivideByRadixPower(a, b) {
    var quotient = new BigInt();
    arrayCopy(a.digits, b, quotient.digits, 0, quotient.digits.length - b);
    return quotient;
}
// Schoolbook multiplication: returns a new BigInt equal to a * b.
// The sign is negative exactly when the operand signs differ.
function biMultiply(a, b) {
    var product = new BigInt();
    var aHigh = biHighIndex(a);
    var bHigh = biHighIndex(b);
    for (var k = 0; k <= bHigh; ++k) {
        var carry = 0;
        // `j` is declared locally: the original assigned an undeclared `j`,
        // leaking a global and failing under strict mode.
        for (var j = 0, i = k; j <= aHigh; ++j, ++i) {
            var partial = product.digits[i] + a.digits[j] * b.digits[k] + carry;
            product.digits[i] = partial & maxDigitVal;
            carry = partial >>> biRadixBits;
        }
        // The carry left over from this row goes into the next digit up.
        product.digits[k + aHigh + 1] = carry;
    }
    product.isNeg = a.isNeg != b.isNeg;
    return product;
}
// Encrypts string `b` with key object `a` (reads a.chunkSize, a.barrett,
// a.e and a.radix). The string is split into chunks of a.chunkSize
// character codes; each chunk is raised to a.e through a.barrett.powMod
// and the per-chunk results are joined with single spaces.
function encryptedString(a, b) {
// Collect the character codes of `b` into `c`.
for (var f, g, h, i, j, k, l, c = new Array, d = b.length, e = 0; d > e; )
c[e] = b.charCodeAt(e),
e++;
// Zero-pad up to a whole number of chunks.
for (; 0 != c.length % a.chunkSize; )
c[e++] = 0;
for (f = c.length,
g = "",
e = 0; f > e; e += a.chunkSize) {
// Pack two character codes per digit (low byte first) into BigInt `j`.
for (j = new BigInt,
h = 0,
i = e; i < e + a.chunkSize; ++h)
j.digits[h] = c[i++],
j.digits[h] += c[i++] << 8;
// Encrypt the chunk and render it in hex (radix 16) or via biToString.
k = a.barrett.powMod(j, a.e),
l = 16 == a.radix ? biToHex(k) : biToString(k, a.radix),
g += l + " "
}
// Drop the trailing separator.
return g.substring(0, g.length - 1)
}
// Reduces `a` modulo this.modulus using the values stored on `this`
// (k, mu, modulus, bkplus1). An estimated quotient is built from digit
// shifts and a multiplication by this.mu, then the remainder is corrected.
function BarrettMu_modulo(a) {
// b..d: estimated quotient; e, g: `a` and quotient*modulus truncated to
// k+1 digits; h: candidate remainder.
var i, b = biDivideByRadixPower(a, this.k - 1), c = biMultiply(b, this.mu), d = biDivideByRadixPower(c, this.k + 1), e = biModuloByRadixPower(a, this.k + 1), f = biMultiply(d, this.modulus), g = biModuloByRadixPower(f, this.k + 1), h = biSubtract(e, g);
// Add this.bkplus1 if the difference went negative, then subtract the
// modulus until the remainder is below it.
for (h.isNeg && (h = biAdd(h, this.bkplus1)),
i = biCompare(h, this.modulus) >= 0; i; )
h = biSubtract(h, this.modulus),
i = biCompare(h, this.modulus) >= 0;
return h
}
// Returns (a * b) reduced through this.modulo.
function BarrettMu_multiplyMod(a, b) {
    return this.modulo(biMultiply(a, b));
}
// Modular exponentiation by repeated squaring: returns a^b reduced via
// this.multiplyMod.
function BarrettMu_powMod(a, b) {
var d, e, c = new BigInt;
// c: accumulator starting at 1; d: current power of the base; e: the
// exponent bits that are still to be processed.
for (c.digits[0] = 1,
d = a,
e = b; ; ) {
// Multiply in the current power when the low exponent bit is set, shift
// the exponent right one bit, and stop once it reaches zero.
if (0 != (1 & e.digits[0]) && (c = this.multiplyMod(c, d)),
e = biShiftRight(e, 1),
0 == e.digits[0] && 0 == biHighIndex(e))
break;
d = this.multiplyMod(d, d)
}
return c
}
// Returns a new BigInt equal to `a` shifted right by `b` bits; the sign
// of `a` is kept.
function biShiftRight(a, b) {
// c: number of whole digits dropped; d: the result.
var e, f, g, h, c = Math.floor(b / bitsPerDigit), d = new BigInt;
// Drop the whole digits first, then shift the remaining e bits, pulling
// the low e bits of each higher neighbour into the top of each digit.
for (arrayCopy(a.digits, c, d.digits, 0, a.digits.length - c),
e = b % bitsPerDigit,
f = bitsPerDigit - e,
g = 0,
h = g + 1; g < d.digits.length - 1; ++g,
++h)
d.digits[g] = d.digits[g] >>> e | (d.digits[h] & lowBitMasks[e]) << f;
// The top digit has no higher neighbour to borrow bits from.
return d.digits[d.digits.length - 1] >>>= e,
d.isNeg = a.isNeg,
d
}
// Returns a new BigInt equal to `a` multiplied by the single digit `b`.
function biMultiplyDigit(a, b) {
    // `result` is declared locally: the original assigned an undeclared
    // `result`, leaking a global and failing under strict mode.
    var result = new BigInt();
    var high = biHighIndex(a);
    var carry = 0;
    for (var f = 0; f <= high; ++f) {
        var partial = result.digits[f] + a.digits[f] * b + carry;
        result.digits[f] = partial & maxDigitVal;
        carry = partial >>> biRadixBits;
    }
    // The final carry becomes the digit above the highest source digit.
    result.digits[1 + high] = carry;
    return result;
}
// Returns a new BigInt equal to a - b.
function biSubtract(a, b) {
var c, d, e, f;
// Different signs: compute a + (-b) by flipping b's sign around the call
// to biAdd, then restore it.
if (a.isNeg != b.isNeg)
b.isNeg = !b.isNeg,
c = biAdd(a, b),
b.isNeg = !b.isNeg;
else {
// Same sign: subtract digit by digit; `e` carries the borrow (0 or -1).
for (c = new BigInt,
e = 0,
f = 0; f < a.digits.length; ++f)
d = a.digits[f] - b.digits[f] + e,
c.digits[f] = 65535 & d,
c.digits[f] < 0 && (c.digits[f] += biRadix),
e = 0 - Number(0 > d);
// A borrow out of the top digit means |b| > |a|: negate the digits to
// get the magnitude and flip the sign.
if (-1 == e) {
for (e = 0,
f = 0; f < a.digits.length; ++f)
d = 0 - c.digits[f] + e,
c.digits[f] = 65535 & d,
c.digits[f] < 0 && (c.digits[f] += biRadix),
e = 0 - Number(0 > d);
c.isNeg = !a.isNeg
} else
c.isNeg = a.isNeg
}
return c
}
// Three-way comparison of two BigInts: returns 1 when a > b, -1 when
// a < b, and 0 when every digit and the signs are equal.
function biCompare(a, b) {
// Different signs: the non-negative operand is the larger one.
if (a.isNeg != b.isNeg)
return 1 - 2 * Number(a.isNeg);
// Same sign: the first differing digit from the top decides; the result
// is inverted when both operands are negative.
for (var c = a.digits.length - 1; c >= 0; --c)
if (a.digits[c] != b.digits[c])
return a.isNeg ? 1 - 2 * Number(a.digits[c] > b.digits[c]) : 1 - 2 * Number(a.digits[c] < b.digits[c]);
return 0
}
// Return a new BigInt whose digits are a's digits moved up by b positions.
function biMultiplyByRadixPower(a, b) {
    var shifted = new BigInt;
    arrayCopy(a.digits, 0, shifted.digits, b, shifted.digits.length - b);
    return shifted;
}
// Copy up to e elements from array a (starting at index b) into array c
// (starting at index d). Copying stops early at the end of a.
function arrayCopy(a, b, c, d, e) {
    var stop = Math.min(b + e, a.length);
    var src = b;
    var dst = d;
    while (src < stop) {
        c[dst] = a[src];
        ++src;
        ++dst;
    }
}
// Return a new BigInt holding a shifted left by b bits; the sign flag is
// copied from a.
function biShiftLeft(a, b) {
    var digitShift = Math.floor(b / bitsPerDigit);
    var shifted = new BigInt;
    // Whole-digit part of the shift: move every digit up by digitShift.
    arrayCopy(a.digits, 0, shifted.digits, digitShift, shifted.digits.length - digitShift);
    var bits = b % bitsPerDigit;
    var carryBits = bitsPerDigit - bits;
    var hi = shifted.digits.length - 1;
    var lo = hi - 1;
    // Sub-digit part, from the top digit down, pulling in the high bits of
    // the digit below.
    for (; hi > 0; --hi, --lo) {
        shifted.digits[hi] = shifted.digits[hi] << bits & maxDigitVal | (shifted.digits[lo] & highBitMasks[bits]) >>> carryBits;
    }
    shifted.digits[0] = shifted.digits[hi] << bits & maxDigitVal;
    shifted.isNeg = a.isNeg;
    return shifted;
}
// Count the bits of a up to and including its highest set bit, looking only
// at the most significant non-zero digit (bit 15 is tested via the 32768 mask).
function biNumBits(a) {
    var top = biHighIndex(a);
    var digit = a.digits[top];
    var maxBits = (top + 1) * bitsPerDigit;
    var bits = maxBits;
    while (bits > maxBits - bitsPerDigit && 0 == (32768 & digit)) {
        digit <<= 1;
        --bits;
    }
    return bits;
}
// Divide a by b and return a two-element Array [f, g]. biDivide() returns
// element 0, so f is presumably the quotient and g the remainder.
// The signs of a and b are toggled temporarily on the early-return path and
// restored before returning. b is rebound locally (shifted), not mutated.
// NOTE(review): the first estimate check uses (p * biRadix + q) but the
// retry inside the loop uses (p * biRadix | q); confirm against the upstream
// BigInt library before "fixing" either form.
function biDivideModulo(a, b) {
var f, g, h, i, j, k, l, m, n, o, p, q, r, s, c = biNumBits(a), d = biNumBits(b), e = b.isNeg;
// Divisor has more bits than the dividend: answer without long division.
if (d > c)
return a.isNeg ? (f = biCopy(bigOne),
f.isNeg = !b.isNeg,
a.isNeg = !1,
b.isNeg = !1,
g = biSubtract(b, a),
a.isNeg = !0,
b.isNeg = e) : (f = new BigInt,
g = biCopy(a)),
new Array(f,g);
// Shift b left until its top digit is at least biHalfRadix; i counts the shift.
for (f = new BigInt,
g = a,
h = Math.ceil(d / bitsPerDigit) - 1,
i = 0; b.digits[h] < biHalfRadix; )
b = biShiftLeft(b, 1),
++i,
++d,
h = Math.ceil(d / bitsPerDigit) - 1;
// Apply the same shift to the running remainder g, then subtract the aligned
// divisor k while g is not smaller than k.
for (g = biShiftLeft(g, i),
c += i,
j = Math.ceil(c / bitsPerDigit) - 1,
k = biMultiplyByRadixPower(b, j - h); -1 != biCompare(g, k); )
++f.digits[j - h],
g = biSubtract(g, k);
// One output digit per iteration: estimate it from the top digits of g and b,
// lower the estimate while r > s, subtract, and add back once if g went negative.
for (l = j; l > h; --l) {
for (m = l >= g.digits.length ? 0 : g.digits[l],
n = l - 1 >= g.digits.length ? 0 : g.digits[l - 1],
o = l - 2 >= g.digits.length ? 0 : g.digits[l - 2],
p = h >= b.digits.length ? 0 : b.digits[h],
q = h - 1 >= b.digits.length ? 0 : b.digits[h - 1],
f.digits[l - h - 1] = m == p ? maxDigitVal : Math.floor((m * biRadix + n) / p),
r = f.digits[l - h - 1] * (p * biRadix + q),
s = m * biRadixSquared + (n * biRadix + o); r > s; )
--f.digits[l - h - 1],
r = f.digits[l - h - 1] * (p * biRadix | q),
s = m * biRadix * biRadix + (n * biRadix + o);
k = biMultiplyByRadixPower(b, l - h - 1),
g = biSubtract(g, biMultiplyDigit(k, f.digits[l - h - 1])),
g.isNeg && (g = biAdd(g, k),
--f.digits[l - h - 1])
}
// Undo the shift on g, set the sign of f, adjust both results when a is
// negative, and clear the sign of a zero g.
return g = biShiftRight(g, i),
f.isNeg = a.isNeg != e,
a.isNeg && (f = e ? biAdd(f, bigOne) : biSubtract(f, bigOne),
b = biShiftRight(b, i),
g = biSubtract(b, g)),
0 == g.digits[0] && 0 == biHighIndex(g) && (g.isNeg = !1),
new Array(f,g)
}
// Serialize d with JSON.stringify, then build {encText, encSecKey} exactly as
// d() does: b() is applied twice (first with g, then with the value from
// a(16)) and c() is applied to that same value with e and f.
// NOTE(review): a/b/c are defined outside this view; presumably random-key
// generation, symmetric encryption and RSA encryption. Confirm.
function md(d, e, f, g) {
    var plaintext = JSON.stringify(d);
    var out = {};
    var nonce = a(16);
    out.encText = b(b(plaintext, g), nonce);
    out.encSecKey = c(nonce, e, f);
    return out;
}
// Build {encText, encSecKey} for the string d: b() is applied twice (first
// with g, then with the value from a(16)) and c() is applied to that same
// value with e and f.
// NOTE(review): a/b/c are defined outside this view; presumably random-key
// generation, symmetric encryption and RSA encryption. Confirm.
function d(d, e, f, g) {
    var out = {};
    var nonce = a(16);
    out.encText = b(b(d, g), nonce);
    out.encSecKey = c(nonce, e, f);
    return out;
}
// Return the first element of the pair produced by biDivideModulo(a, b).
function biDivide(a, b) {
    var pair = biDivideModulo(a, b);
    return pair[0];
}
// Map a character code to a digit value: '0'-'9' give 0-9, 'A'-'Z' and
// 'a'-'z' give 10 upwards, and any other code gives 0.
function charToHex(a) {
    if (a >= 48 && a <= 57) {
        return a - 48;
    }
    if (a >= 65 && a <= 90) {
        return 10 + a - 65;
    }
    if (a >= 97 && a <= 122) {
        return 10 + a - 97;
    }
    return 0;
}
// Build a BigInt from the number a: the sign goes into isNeg and the absolute
// value is split into digits, lowest digit first.
function biFromNumber(a) {
    var out = new BigInt;
    out.isNeg = 0 > a;
    a = Math.abs(a);
    var idx = 0;
    while (a > 0) {
        out.digits[idx++] = a & maxDigitVal;
        a >>= biRadixBits;
    }
    return out;
}
// Constructor: parse the three hex strings into BigInts (e, d, m), derive the
// chunk size from the highest digit index of m, and prepare a BarrettMu
// reducer for m.
function RSAKeyPair(a, b, c) {
    this.e = biFromHex(a);
    this.d = biFromHex(b);
    this.m = biFromHex(c);
    this.chunkSize = 2 * biHighIndex(this.m);
    this.radix = 16;
    this.barrett = new BarrettMu(this.m);
}
// Parse the hex string a into a BigInt, consuming it from the right in
// groups of up to four characters (one digit per group, lowest digit first).
function biFromHex(a) {
    var out = new BigInt;
    var end = a.length;
    var idx = 0;
    while (end > 0) {
        out.digits[idx] = hexToDigit(a.substr(Math.max(end - 4, 0), Math.min(end, 4)));
        end -= 4;
        ++idx;
    }
    return out;
}
// Convert up to the first four characters of the string a into one number,
// four bits per character, most significant character first.
function hexToDigit(a) {
    var value = 0;
    var count = Math.min(a.length, 4);
    for (var i = 0; i < count; ++i) {
        value = value << 4 | charToHex(a.charCodeAt(i));
    }
    return value;
}
// Return the index of the highest non-zero digit of a, or 0 when every digit
// above index 0 is zero.
function biHighIndex(a) {
    var top = a.digits.length - 1;
    while (top > 0 && 0 == a.digits[top]) {
        top--;
    }
    return top;
}
// Constructor for a reducer bound to the modulus a. It stores a copy of the
// modulus, its digit count k, mu (a BigInt with digit 2k set to 1, divided by
// the modulus) and bkplus1 (a BigInt with digit k+1 set to 1), then attaches
// the modulo / multiplyMod / powMod functions.
function BarrettMu(a) {
    this.modulus = biCopy(a);
    this.k = biHighIndex(this.modulus) + 1;
    var radixPower = new BigInt;
    radixPower.digits[2 * this.k] = 1;
    this.mu = biDivide(radixPower, this.modulus);
    this.bkplus1 = new BigInt;
    this.bkplus1.digits[this.k + 1] = 1;
    this.modulo = BarrettMu_modulo;
    this.multiplyMod = BarrettMu_multiplyMod;
    this.powMod = BarrettMu_powMod;
}
// Return an independent copy of a: the digit array is duplicated with slice
// and the sign flag is carried over.
function biCopy(a) {
    var copy = new BigInt(true);
    copy.digits = a.digits.slice(0);
    copy.isNeg = a.isNeg;
    return copy;
}
// Sample request payload kept for manual experiments. It is only referenced
// by the commented-out call below; `ax` is assigned without a declaration.
ax = {
csrf_token: "d5e1f281f7b6f7ff2caf0af810f347d6",
lencodeType: "aac",
ids: "[010014984FD3A423D7EC79009184DDA27700]",
level: "standard"
}
// console.log(d(JSON.stringify(ax), '010001', '8c979a9a86a6e4b3c1de07b6f93bd8d4', '0CoJUm6Qyw8W8jud'))
// NOTE(review): bMr1x is not defined anywhere in the visible part of this
// file; if it is not defined earlier this line throws when the script is
// evaluated. Confirm before relying on this file via execjs.
console.log(bMr1x)
// The two string literals below are bare expression statements (sample
// comment-request payloads); they have no effect at runtime.
'{"rid":"R_SO_4_%d","threadId":"R_SO_4_1297486027","pageNo":"1","pageSize":"20","cursor":"-1","offset":"0","orderType":"1","csrf_token":"ee74402ef50d2a957bccb7b540f4bc27"}'
'{"rid":"R_VI_62_4096A8D2343DB13036C15EDE76355DE9","threadId":"R_VI_62_4096A8D2343DB13036C15EDE76355DE9","pageNo":"1","pageSize":"20","cursor":"-1","offset":"0","orderType":"1","csrf_token":"ee74402ef50d2a957bccb7b540f4bc27"}'

View File

@ -0,0 +1,394 @@
import execjs
import requests
from tqdm import tqdm
import csv
import os
# Fixed arguments handed to the JS functions `d` / `md` in demo.js as their
# 2nd, 3rd and 4th parameters (e, f, g) on every call in this module.
# NOTE(review): presumably the RSA public exponent, RSA modulus (hex) and the
# first symmetric key; confirm against demo.js.
xx = '010001'
yy = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
zz = '0CoJUm6Qyw8W8jud'
# Cookies sent with every POST; MUSIC_U is left empty in the repository.
cookies = {
    'MUSIC_U': '',
}
# Browser-like request headers sent with every request in this module.
headers = {
    'authority': 'music.163.com',
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
    # 'cookie': intentionally omitted. A captured browser cookie (which
    # contained account identifiers and session tokens) used to be kept here
    # as a comment; never commit real session cookies.
    'origin': 'https://music.163.com',
    'referer': 'https://music.163.com/',
    'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}
# Query-string parameters sent with every POST. The token value was blank in
# the committed file (which made the dict literal a syntax error); an empty
# string keeps the module importable, mirroring the empty MUSIC_U cookie.
# Fill in a real csrf_token when using a logged-in session.
params = {
    'csrf_token': '',
}
def replace_lastChar(former_str, replacechar):
    """Return *former_str* with its final character swapped for *replacechar*."""
    head = former_str[:-1]
    return head + replacechar
def returnError(response):
    """Report an empty search result.

    Returns 0 (after printing a notice) when the ``result`` entry of the
    response JSON is an empty dict, otherwise returns None.
    """
    payload = response.json()
    if payload['result'] != {}:
        return None
    print('搜索的内容无法查找到!')
    return 0
def downloader(url, i3x, id):
    """Resolve a video URL through the encrypted API and save it as an MP4.

    Args:
        url: weapi endpoint that returns the playable URL.
        i3x: plaintext JSON request body (string) to be encrypted by demo.js.
        id: identifier used for the output file name ``./MV/<id>.mp4``.

    Raises:
        KeyError / IndexError: if the response has neither ``data.url`` nor
            ``urls[0].url``.
    """
    # Read the JS source inside a context manager so the handle is closed.
    with open('demo.js', 'r', encoding='utf-8') as js_file:
        js_source = js_file.read()
    param = execjs.compile(js_source).call('d', i3x, xx, yy, zz)
    form = {
        'params': param['encText'],
        'encSecKey': param['encSecKey']
    }
    response = requests.post(
        url,
        params=params,
        cookies=cookies,
        headers=headers,
        data=form,
    )
    payload = response.json()
    # The two endpoints answer with different shapes; fall back only for the
    # errors a missing/differently-typed "data" entry can produce instead of
    # swallowing every exception.
    try:
        media_url = payload['data']['url']
    except (KeyError, TypeError):
        media_url = payload['urls'][0]['url']
    res = requests.get(media_url, stream=True)
    content_size = int(res.headers['Content-Length']) / 1024
    print('正在下载MV。。。')
    with open(f'./MV/{id}.mp4', 'wb') as fp:
        for chunk in tqdm(iterable=res.iter_content(1024),
                          total=content_size,
                          unit='k'):
            fp.write(chunk)
    print('下载完成!')
def searchLoader(i3x, name, offset):
    """POST an encrypted cloud-search request and return the raw response.

    Args:
        i3x: JSON template containing one ``%s`` (search text, inside a JSON
            string) and one ``%d`` (result offset) placeholder.
        name: search text entered by the user.
        offset: result offset for paging.

    Returns:
        The ``requests.Response`` of the search endpoint.
    """
    import json

    # Escape the search text before splicing it into the JSON template, so
    # quotes or backslashes typed by the user cannot break the request body.
    # For ordinary text the result is identical to the raw string.
    safe_name = json.dumps(name, ensure_ascii=False)[1:-1]
    i3x = i3x % (safe_name, offset)
    # Read the JS source inside a context manager so the handle is closed.
    with open('demo.js', 'r', encoding='utf-8') as js_file:
        js_source = js_file.read()
    param = execjs.compile(js_source).call('d', i3x, xx, yy, zz)
    data = {
        'params': param['encText'],
        'encSecKey': param['encSecKey']
    }
    response = requests.post(
        'https://music.163.com/weapi/cloudsearch/get/web',
        params=params,
        cookies=cookies,
        headers=headers,
        data=data,
    )
    return response
def Post(time, id, i, key):
    """Fetch one page of comments and return the decoded JSON.

    Args:
        time: paging cursor; sent as the ``cursor`` field.
        id: resource id (string when ``key == 2``, because its length is used
            to choose the thread prefix).
        i: page number.
        key: 1 selects the ``R_SO_4_`` thread prefix with order type "2";
            2 selects ``R_VI_62_`` for 32-character ids and ``R_MV_5_``
            otherwise, with order type "1".

    Raises:
        ValueError: if *key* is neither 1 nor 2 (previously this surfaced as
            an UnboundLocalError further down).
    """
    if key == 1:
        prefix, order_type = 'R_SO_4_', "2"
    elif key == 2:
        prefix = 'R_VI_62_' if len(id) == 32 else 'R_MV_5_'
        order_type = "1"
    else:
        raise ValueError(f'unsupported comment key: {key!r}')
    thread = f"{prefix}{id}"
    # Key order is kept as before because the dict is serialized by the JS side.
    i3x = {
        'csrf_token': "d5e1f281f7b6f7ff2caf0af810f347d7",
        'cursor': str(time),
        'offset': "0",
        'orderType': order_type,
        'pageNo': f"{i}",
        'pageSize': "20",
        'rid': thread,
        'threadId': thread,
    }
    # Read the JS source inside a context manager so the handle is closed.
    with open('demo.js', 'r', encoding='utf-8') as js_file:
        js_source = js_file.read()
    param = execjs.compile(js_source).call('md', i3x, xx, yy, zz)
    data = {
        'params': param['encText'],
        'encSecKey': param['encSecKey']
    }
    response = requests.post('https://music.163.com/weapi/comment/resource/comments/get', params=params,
                             headers=headers, data=data, cookies=cookies)
    return response.json()
def getCursor(datas):
    """Return the paging cursor for the next comment page.

    The cursor is the ``time`` of the 20th comment; when the page holds fewer
    than 20 comments the string ``'end'`` is returned instead.
    """
    page = datas["data"]["comments"]
    return 'end' if len(page) < 20 else page[19]['time']
def comment(datas):
    """Flatten one page of comment JSON into a list of row dicts.

    Every comment yields its own row. Users without VIP data
    (``vipRights`` is None) get VIP level 0.

    Previously such a comment did not produce a row of its own: the row of
    the preceding comment was mutated (its VIP level set to 0) and appended a
    second time, so the comment was lost and the earlier row corrupted.
    """
    infos = []
    for info in datas["data"]["comments"]:
        user = info["user"]
        vip_rights = user["vipRights"]
        infos.append({
            '评论人的ID': user['userId'],
            '评论人网名': user['nickname'],
            '评论人VIP等级': vip_rights['redVipLevel'] if vip_rights is not None else 0,
            '评论人头像网址': user['avatarUrl'],
            '评论编号': info['commentId'],
            '评论时间': info['timeStr'],
            '点赞量': info['likedCount'],
            '评论内容': info['content'],
            'ip地址': info["ipLocation"]['location']
        })
    return infos
def get_comment(endNum, id, key):
    """Download up to *endNum* pages of comments into a CSV file.

    Args:
        endNum: last page number to fetch.
        id: resource id; also used in the file name
            ``./comment/comment_of_<id>.csv``.
        key: comment kind forwarded to ``Post`` (1 or 2).

    The CSV is written inside a ``with`` block so it is closed (and flushed)
    even when one of the page requests raises.
    """
    header = ['评论人的ID', '评论人网名', '评论人VIP等级', '评论人头像网址', '评论编号', '评论时间', '点赞量',
              '评论内容', 'ip地址']
    with open(f'./comment/comment_of_{id}.csv', 'w', encoding='utf-8', newline='') as fp:
        writer = csv.DictWriter(fp, header)
        writer.writeheader()
        id = str(id)
        i = 1
        data = Post(-1, id, i, key)
        while True:
            cursor = getCursor(data)
            writer.writerows(comment(data))
            print('正在下载第', i, '页的评论')
            # Stop on the last available page or when the page limit is hit.
            if cursor == 'end' or i == endNum:
                break
            i += 1
            data = Post(cursor, id, i, key)
def get_lyric(id):
    """Download the lyric of song *id* into ``./lyric/lyric_of_<id>.txt``.

    The translated lyric is appended after a separator line when the response
    contains one.

    Args:
        id: numeric song id (formatted with ``%d``).
    """
    i3x = '{"id":"%d","lv":-1,"tv":-1,"csrf_token":"d5e1f281f7b6f7ff2caf0af810f347d7"}' % id
    # Read the JS source inside a context manager so the handle is closed.
    with open('demo.js', 'r', encoding='utf-8') as js_file:
        js_source = js_file.read()
    param = execjs.compile(js_source).call('d', i3x, xx, yy, zz)
    data = {
        'params': param['encText'],
        'encSecKey': param['encSecKey']
    }
    response = requests.post('https://music.163.com/weapi/song/lyric', params=params, cookies=cookies, headers=headers,
                             data=data)
    payload = response.json()
    res = payload["lrc"]['lyric']
    # A missing translation is expected; only that case is tolerated instead
    # of swallowing every exception.
    try:
        resp = payload["tlyric"]['lyric']
    except (KeyError, TypeError):
        resp = ''
    with open(f'./lyric/lyric_of_{id}.txt', 'w', encoding='utf-8') as fp:
        fp.write(res)
        if resp != '':
            fp.write('译文如下所示\n')
            fp.write(resp)
def get_musicUrl(mid):
    """Return the download URL of song *mid*, or ``'error'`` if there is none.

    Args:
        mid: numeric song id (formatted with ``%d``).
    """
    i3x = '{"ids":"[%d]","level":"lossless","encodeType":"aac","csrf_token":"d5e1f281f7b6f7ff2caf0af810f347d7"}' % mid
    # Read the JS source inside a context manager so the handle is closed.
    with open('demo.js', 'r', encoding='utf-8') as js_file:
        js_source = js_file.read()
    param = execjs.compile(js_source).call('d', i3x, xx, yy, zz)
    data = {
        'params': param['encText'],
        'encSecKey': param['encSecKey']
    }
    response = requests.post(
        'https://music.163.com/weapi/song/enhance/player/url/v1',
        params=params,
        cookies=cookies,
        headers=headers,
        data=data,
    )
    song_url = response.json()["data"][0]['url']
    # The API answers with a null url when the track cannot be played.
    return 'error' if song_url is None else song_url
def GetMusic(url, mid):
    """Stream the song at *url* into ``./music/<mid>.mp3``.

    Args:
        url: download URL, or the sentinel ``'error'`` returned by
            ``get_musicUrl`` when no source is available.
        mid: song id used for the output file name.

    When the first request is answered with HTTP 403 the download is retried
    once with the last character of the URL replaced by ``'r'``.
    """
    if url == 'error':
        print('歌曲暂无音源或需要购买专辑才能下载')
        return
    response = requests.get(
        url=url,
        headers=headers,
        stream=True
    )
    # Compare the status code directly instead of the repr of the response.
    if response.status_code == 403:
        new_url = replace_lastChar(url, 'r')
        response = requests.get(
            url=new_url,
            headers=headers,
            stream=True
        )
    content_size = int(response.headers['Content-Length']) / 1024
    print('正在下载歌曲。。。')
    with open(f'./music/{mid}.mp3', 'wb') as fp:
        for chunk in tqdm(iterable=response.iter_content(1024),
                          total=content_size,
                          unit='k'):
            fp.write(chunk)
    print('下载完成!')
def searchSong(name, offset):
    """Search songs, print one summary dict per hit, return the total count.

    Returns 0 when ``returnError`` reports an empty result.
    """
    template = '{"hlpretag":"<span class=\\"s-fc7\\">","hlposttag":"</span>","id":"160947","s":"%s","type":"1","offset":"%d","total":"true","limit":"30","csrf_token":"ee74402ef50d2a957bccb7b540f4bc27"}'
    response = searchLoader(template, name, offset)
    if returnError(response) == 0:
        return 0
    result = response.json()["result"]
    total = result['songCount']
    songs = result['songs']
    print('搜索到的结果有', total, '')
    for song in songs:
        print({
            '歌曲id': song['id'],
            '歌曲名称': song['name'],
            '歌手姓名': song['ar'][0]['name'],
            '专辑名称': song['al']['name'],
            'mvid': song['mv']
        })
    return total
def searchFunction(flag):
    """Interactive search loop.

    *flag* ``'1'`` searches songs and ``'2'`` searches MVs; any other value
    prints an error and returns. Results are paged 30 at a time until the
    total is exceeded or the user enters ``0``.
    """
    name = input('请输入想要搜索的内容:\n')
    offset = 0
    while True:
        if flag not in ('1', '2'):
            print('错误的输入,请按提示进行输入')
            return
        total = searchSong(name, offset) if flag == '1' else searchMV(name, offset)
        offset += 30
        if offset > total:
            print('已超出上线,退出搜索功能\n')
            return
        answer = input(f'是否继续搜索,输入任意字符继续搜索{name}下一页的内容退出请输入0\n')
        if answer == '0':
            return
def downloadFunction(flag):
    """Interactive download menu.

    *flag* ``'3'`` handles a song (audio, lyric, comments); ``'4'`` handles an
    MV (video, comments). Each menu repeats until the user enters ``0``.
    """
    if flag == '3':
        id = int(input('请输入想要下载的歌曲id\n'))
        while True:
            choice = input('请输入想要进行的操作: 1.下载歌曲 2.下载歌词 3.下载评论 0.退出\n')
            if choice == '0':
                break
            if choice == '1':
                print('重复下载将会覆盖之前下载的文件')
                GetMusic(get_musicUrl(id), id)
            elif choice == '2':
                print('重复下载将会覆盖之前下载的文件')
                get_lyric(id)
            elif choice == '3':
                print('重复下载将会覆盖之前下载的文件')
                pages = input('请输入下载评论的页数\n')
                get_comment(int(pages), id, 1)
            else:
                print('错误输入,请重新输入')
    elif flag == '4':
        id = input('请输入想要下载的MV的id:\n')
        while True:
            choice = input('请输入想要进行的操作: 1.下载MV 2.下载评论 0.退出\n')
            if choice == '0':
                break
            if choice == '1':
                print('重复下载将会覆盖之前下载的文件')
                downloadMV(id)
            elif choice == '2':
                print('重复下载将会覆盖之前下载的文件')
                pages = input('请输入下载评论的页数\n')
                get_comment(int(pages), id, 2)
            else:
                print('错误输入,请重新输入')
def searchMV(name, offset):
    """Search MVs, print id and title of each hit, return the total count.

    Returns 0 when ``returnError`` reports an empty result.
    """
    template = '{"hlpretag":"<span class=\\"s-fc7\\">","hlposttag":"</span>","id":"160947","s":"%s","type":"1014","offset":"%d","total":"true","limit":"20","csrf_token":"ee74402ef50d2a957bccb7b540f4bc27"}'
    response = searchLoader(template, name, offset)
    if returnError(response) == 0:
        return 0
    result = response.json()["result"]
    total = result['videoCount']
    videos = result['videos']
    print('搜索到的结果有', total, '')
    for video in videos:
        print({
            'MV的id': video['vid'],
            'MV名称': video['title'],
        })
    return total
def downloadMV(id):
    """Download the MV identified by the string *id*.

    32-character ids go to the cloudvideo endpoint and are sent as strings;
    any other id is converted to int and sent to the MV endpoint.
    """
    if len(id) != 32:
        id = int(id)
        body = '{"id":"%d","r":"1080","csrf_token":"ee74402ef50d2a957bccb7b540f4bc27"}' % id
        endpoint = 'https://music.163.com/weapi/song/enhance/play/mv/url'
    else:
        body = '{"ids":"[\\"%s\\"]","resolution":"1080","csrf_token":"ee74402ef50d2a957bccb7b540f4bc27"}' % id
        endpoint = 'https://music.163.com/weapi/cloudvideo/playurl'
    downloader(endpoint, body, id)
def checkDir():
    """Create the four output folders in the working directory if missing."""
    for folder in ('./music', './MV', './lyric', './comment'):
        if not os.path.exists(folder):
            os.mkdir(folder)
if __name__ == '__main__':
    # Entry point: make sure the output folders exist, then show the menu
    # until the user chooses 0.
    checkDir()
    while True:
        print('欢迎使用音乐下载器')
        key = input('请输入想要选择的功能: 1.搜索歌曲 2.搜索MV 3.下载歌曲 4.下载MV 0.退出程序\n')
        if key == '0':
            break
        if key in ('1', '2'):
            searchFunction(key)
        elif key in ('3', '4'):
            downloadFunction(key)
        else:
            print('错误输入,请重新输入')

50
lxml&re/4k图片爬取.py Normal file
View File

@ -0,0 +1,50 @@
import requests
from lxml import etree
import urllib3 # 禁用安全请求警告,当目标使用htpps时使用
import os
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# 解决爬取网页出现中文乱码的情况
def rebuilt_Language(url, headers):
    """GET *url* with certificate verification disabled and return the response.

    The encoding override is left switched off here; callers repair the text
    encoding themselves.
    """
    # response.encoding = response.apparent_encoding
    return requests.get(url=url, headers=headers, verify=False)
if __name__ == "__main__":
    # Download the thumbnails listed on the first two index pages into ./picLibs.
    # Spoof a desktop browser User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    # Folder that receives the pictures.
    if not os.path.exists('./picLibs'):
        os.mkdir('./picLibs')
    # URL template for the paginated index.
    url = 'https://pic.netbian.com/4kmeinv/index_%d.html'
    src_list = []  # image URLs
    img_name_list = []  # matching file names
    for pageNum in range(1, 3):
        new_url = url % pageNum
        page_text = rebuilt_Language(url=new_url, headers=headers).text
        tree = etree.HTML(page_text)
        # Each <li> carries the image src and its alt text.
        li_list = tree.xpath('//div[@class="wrap clearfix"]//li')
        for li in li_list:
            # No leading space in the base URL: the request should not rely on
            # the HTTP client trimming it.
            src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            src_list.append(src)
            img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
            # The page text was decoded as ISO-8859-1; re-decode the name as GBK.
            img_name = img_name.encode('iso-8859-1').decode('gbk')
            img_name_list.append(img_name)
    # Fetch every picture and store it; zip pairs each URL with its name
    # instead of tracking a separate index counter.
    # NOTE(review): the page requests disable certificate verification but
    # these image requests do not; confirm which behavior is intended.
    for img_url, img_name in zip(src_list, img_name_list):
        img_data = requests.get(url=img_url, headers=headers).content
        img_path = 'picLibs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(img_name + '下载成功!')

View File

@ -0,0 +1,21 @@
import requests
from lxml import etree
if __name__ == '__main__':
    # Collect listing titles from pages 1-8 and write them to 58.txt.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    # URL template for the paginated listing.
    url = 'https://sh.58.com/ershoufang/p%d/?PGTID=0d30000c-0000-2e04-d18a-9af183e2d6a4&ClickID=1'
    # The output file is opened in a with-block so it is flushed and closed
    # even if a request or an XPath lookup fails midway.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for pageNum in range(1, 9):
            new_url = url % pageNum  # full URL of this page
            page_text = requests.get(url=new_url, headers=headers).text
            tree = etree.HTML(page_text)
            tongji_list = tree.xpath('//section[@class="list"]/div')
            for li in tongji_list:
                title = li.xpath('./a/div[2]//h3/text()')[0]
                print(title)
                fp.write(title + '\n')
    print('over!')

28
lxml&re/GetFakeUA.py Normal file
View File

@ -0,0 +1,28 @@
import requests
from lxml import etree
# Scrape the Chrome user-agent strings listed on useragentstring.com and
# append them, quoted and comma-terminated, to ./fake_UA.txt.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
}
url = 'https://useragentstring.com/pages/useragentstring.php?name=Chrome'
resp = requests.get(url=url, headers=headers).text
tree = etree.HTML(resp)
ul_list = tree.xpath('//*[@id="liste"]/ul')
USER_AGENT = []
# The with-block closes the file even if printing or writing fails.
with open('./fake_UA.txt', 'a', encoding='utf-8') as fp:
    for ul in ul_list:
        UA = ul.xpath('./li/a/text()')
        # The first entry of every list is skipped, as before.
        for entry in UA[1:]:
            ua = '"' + entry + '",\n'
            print(ua)
            fp.write(ua)

35
lxml&re/bs4案例.py Normal file
View File

@ -0,0 +1,35 @@
import requests
from bs4 import BeautifulSoup
import urllib3 # 禁用安全请求警告,当目标使用htpps时使用
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# 解决爬取网页出现中文乱码的情况
def rebuilt_Language(url, headers):
    """GET *url* without certificate verification and fix the text encoding.

    The response encoding is replaced by the encoding detected from the body
    so that ``.text`` decodes Chinese pages correctly.
    """
    page = requests.get(url=url, headers=headers, verify=False)
    page.encoding = page.apparent_encoding
    return page
# 爬取三国演义小说所有的章节标题和章节内容
if __name__ == "__main__":
    # Save every chapter title and chapter text of the novel to ./sanguo.txt.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    page_text = rebuilt_Language(url, headers).text
    # Parse the table of contents.
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('.book-mulu > ul >li')
    # The with-block closes the output file even if a chapter request fails,
    # so chapters written so far are not lost in an unflushed buffer.
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.a.string
            detail_url = 'https://www.shicimingju.com' + li.a['href']
            detail_page_text = rebuilt_Language(detail_url, headers).text
            # Extract the chapter body from the detail page.
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            content = div_tag.text
            fp.write(title + ":" + content + "\n")
            print(title, '爬取成功')

View File

@ -0,0 +1,18 @@
import requests
from bs4 import BeautifulSoup
if __name__ == "__main__":
    # Load the local HTML file into a BeautifulSoup object. The file is read
    # while the soup is built, so it can be closed right afterwards.
    with open('./test.html', 'r', encoding='utf-8') as fp:
        soup = BeautifulSoup(fp, 'lxml')
    # Other lookups to try:
    # print(soup)
    # print(soup.a)  # soup.tagName returns the first tag with that name
    # print(soup.find('div'))  # same as soup.div
    # print(soup.find('div', class_='song'))
    # print(soup.find_all('a'))
    # print(soup.select('.tang'))
    # print(soup.select('.tang > ul > li > a')[0])
    # print(soup.select('.tang > ul a')[0])
    # print(soup.select('.tang > ul a')[0].text)
    tag = soup.find('div', class_='song')
    print(tag.text)

35
lxml&re/test.html Normal file
View File

@ -0,0 +1,35 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>测试bs4</title>
</head>
<body>
<div>
<p>百里守约</p>
</div>
<div class = "song">
<p>李清照</p>
<p>王安石</p>
<p>苏轼</p>
<p>柳宗元</p>
<a href="https://www.song.com/" title="赵匡胤" target="_self">
<span>this is span</span>
宋朝是最强大的王朝,不是军队的强大,而是经济很强大,国民都很有钱
</a>
<a href="" class="du">总为浮云能避日,长安不见使人愁</a>
<img src="https://www.baidu.com/meinv.jpg" alt="" />
</div>
<div class="tang">
<ul>
<li><a href="https://www.baidu.com" title="qing">清明时节雨纷纷,路上行人欲断魂</a> </li>
<li><a href="https://www.163.com" title="qin">秦时明月</a> </li>
<li><a href="https://www.sina.com" class="du">杜甫</a> </li>
<li><a href="https://www.dudu.com" class="du">杜牧</a> </li>
<li><b>杜小咪</b></li>
<li><i>度蜜月</i></li>
<li><a href="https://www.shu.edu.cn" id="feng">凤凰台上凤凰游,风趣太空江自流</a> </li>
</ul>
</div>
</body>
</html>

13
lxml&re/xpath基础.py Normal file
View File

@ -0,0 +1,13 @@
from lxml import etree
if __name__ == "__main__":
    # Parse the local fixture and print the direct text of every <p> inside
    # the div with class "song".
    document = etree.parse('./test.html')
    # Other expressions to try:
    # r = document.xpath('/html//title')
    # r = document.xpath('//div[@class="song"]')
    # r = document.xpath('//div[@class="song"]/p[3]')
    # r = document.xpath('//div[@class="tang"]/ul/li[4]/a/text()')
    # r = document.xpath('//div[@class="tang"]//text()')
    # r = document.xpath('//div[@class="song"]/img/@src')
    print(document.xpath('//div[@class="song"]/p/text()'))

32
lxml&re/正则练习.py Normal file
View File

@ -0,0 +1,32 @@
import re
# Regex practice snippets. Only the first exercise is active; the others are
# kept commented out for reference.
# Extract "python"
key = "java python c++ php"
s = re.findall('python', key)[0]
print(s)
# Strip the "_s" thumbnail marker from an image URL
# key = 'https://scpic.chinaz.net/files/default/imgs/2023-01-04/610de886ffc6b37d_s.jpg'
# s = re.sub('_s', '', key)
# print(s)
# Extract "hello world"
# key = "<html><h1>hello world<h1><html>"
# s = re.findall('<h1>(.*)<h1>', key)[0]
# print(s)
# Extract "170"
# string = '我喜欢身高为170的女生'
# s = re.findall('\d+', string)[0]
# print(s)
# Extract "http://" and "https://"
# key = 'http://www.baidu.com and https://dong.com'
# s = re.findall('https?://', key)
# print(s)
# Extract "hello"
# key = 'lalala<hTml>hello</HTMl>hahaha'
# s = re.findall('<[Hh][Tt][mM][lL]>(.*)</[Hh][Tt][mM][Ll]>', key)
# print(s)
# Extract "hit."
# key = 'bobo@hit.edu.cn'
# s = re.findall('h.*?\.', key)[0]
# print(s)
# Extract "saas" and "sas"
# key = 'saas and sas and saaas'
# s = re.findall('sa{1,2}s', key)
# print(s)

28
lxml&re/正则解析.py Normal file
View File

@ -0,0 +1,28 @@
import re
import requests
import os
# 爬取图片
if __name__ == "__main__":
    # Download every picture referenced by a "pic" block on the front page
    # into ./imgLibs.
    target_dir = './imgLibs'
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)
    url = 'https://www.douban.com/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    # Fetch the whole page first ...
    page_text = requests.get(url=url, headers=headers).text
    # ... then pull the data-origin URL out of every picture block.
    ex = '<div class="pic">.*?<img src=.*? data-origin="(.*?)" alt=.*?</div>'
    for src in re.findall(ex, page_text, re.S):
        # Raw bytes of the picture.
        img_data = requests.get(url=src, headers=headers).content
        # The file name is the last path segment of the URL.
        img_name = src.split('/')[-1]
        with open('./imgLibs/' + img_name, 'wb') as fp:
            fp.write(img_data)
            print(img_name, '下载成功')

51
lxml&re/简历爬取.py Normal file
View File

@ -0,0 +1,51 @@
import requests
from lxml import etree
import os
if __name__ == '__main__':
    # Download the free resume templates listed on the first two index pages
    # into ./CV_moban.
    # Spoof a desktop browser User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }
    url0 = 'https://sc.chinaz.com/jianli/free.html'  # page 1 has no numeric suffix (free_1 does not open)
    url = 'https://sc.chinaz.com/jianli/free_%d.html'
    download_list = []
    download_name_list = []
    # Folder that receives the archives.
    if not os.path.exists('./CV_moban'):
        os.mkdir('./CV_moban')
    # Walk the index pages.
    for pageNum in range(1, 3):
        new_url = url0 if pageNum == 1 else url % pageNum
        page_text = requests.get(url=new_url, headers=headers).text
        tree = etree.HTML(page_text)
        # Every entry links to a detail page that holds the download link.
        CV_infor_list = tree.xpath('//div[@class="main_list jl_main"]/div')
        for cv in CV_infor_list:
            CV_src = cv.xpath('./a/@href')[0]
            CV_text = requests.get(url=CV_src, headers=headers).text
            ctree = etree.HTML(CV_text)
            # Download link of the template.
            download_src = ctree.xpath('//div[@class="down_wrap"]/div[2]/ul/li/a/@href')[0]
            download_list.append(download_src)
            # Template title; the page text was decoded as ISO-8859-1, so it
            # is re-decoded as UTF-8.
            download_name = ctree.xpath('//div[@class="bgwhite"]/div//h1/text()')[0]
            download_name = download_name.encode('iso-8859-1').decode('utf-8') + '.rar'
            download_name_list.append(download_name)
    # Download the archives; zip pairs each link with its name, replacing the
    # manual index counter and the redundant re-read of the loop variable.
    for cvv, cv_name in zip(download_list, download_name_list):
        cv_content = requests.get(url=cvv, headers=headers).content
        cv_path = 'CV_moban/' + cv_name
        with open(cv_path, 'wb') as fp:
            fp.write(cv_content)
            print(cv_name + '下载完成!')

451
note.txt Normal file
View File

@ -0,0 +1,451 @@
爬虫在使用场景中的分类:
- 通用爬虫
抓取系统重要组成部分。抓取的是一整张页面的数据。
- 聚焦爬虫
是建立在通用爬虫的基础之上。抓取的是页面中特定的局部内容。
- 增量式爬虫
检测网站中数据更新的情况。只会抓取网站中最新更新出来的数据。
爬虫的矛与盾:
- 反爬机制
门户网站,可以通过制定相应的策略或者技术手段,防止爬虫程序进行网站数据的爬取。
- 反反爬策略
爬虫程序可以通过制定相关的策略或者技术手段,破解门户网站中具备的反爬机制,从而获取门户网站中的相关数据。
http协议
- 概念:服务器与客户端进行数据交互的一种形式。
- 常用请求头信息:
1User-Agent请求载体的身份标识。
2Connection请求完毕后是断开连接还是保持连接。
- 常见响应头信息:
1Content-Type服务器响应回客户端的数据类型
https协议
- 概念:安全的超文本传输协议
- 加密方式
1对称密钥加密
2非对称密钥加密
3证书密钥加密
requests模块:python中一款基于网络请求的第三方模块(需 pip install requests),功能强大,简单便捷,效率极高。
- 作用:模拟浏览器发请求
- 如何使用
1指定url
2发起请求Get or Post
3获取响应数据
4持久化存储
动态加载数据:
网页信息可能是动态加载的ajax动态请求可以在XHR中查看真正url
数据解析:
- 正则
- bs4
- xpath***
数据解析原理:
大部分文本内容都存在标签中或者标签对应的属性中
进行指定标签定位
标签或标签对应的属性中存储的数据值进行提取(解析)
聚焦爬虫编码流程:
- 指定url
- 发起请求
- 获取响应数据
- 数据解析
- 持久化存储
正则表达式:(import re)
- 单字符:
. : 除换行以外的所有字符
[] : [aoe] [a-w] 匹配集合任意一个字符
\d : 数字 [0-9]
\D : 非数字
\w : 数字,字母,下划线,中文
\W : 非\w
\s : 所有的空白字符,包括空格,制表符,换页符等等。等价于 [\f\n\r\t\v]
\S : 非空白
- 数量修饰:
* : 任意多次 >=0
+ : 至少一次 >=1
? : 可有可无0次或1次
{m} : 固定m次 hello{3}
{m,} : 至少m次
{m,n} : m-n次
- 边界:
$ : 以某某结尾
^ : 以某某开头
- 分组:
(ab)
- 贪婪模式: .*
- 非贪婪模式: .*?
- re.I: 忽略大小写
- re.M: 多行匹配
- re.S: 单行匹配(DOTALL,使 . 也能匹配换行符)
- re.sub(正则表达式,替换内容, 字符串)
<div class="pic">
<a href="https://www.douban.com/photos/album/1727324287/">
<img src="https://img1.doubanio.com/view/photo/albumcover/public/p2578730628.webp"
data-origin="https://img1.doubanio.com/view/photo/albumcover/public/p2578730628.webp" alt="">
</a>
</div>
ex = '<div class="pic">.*?<img src=.*? data-origin="(.*?)" alt=.*?</div>'
bs4进行数据解析
- 数据解析原理:
标签定位
提取标签、标签属性的数据值
- bs4数据解析原理
1.实例化一个BeautifulSoup对象并将页面源码数据加载到该对象中
2.通过调用BeautifulSoup对象中相关属性或者方法进行标签定位和数据提取
- 进行环境的安装:
pip install bs4
pip install lxml
- 如何实例化BeautifulSoup对象
1.from bs4 import BeautifulSoup
2.对象的实例化
- 将本地的html文档中的数据加载到该对象中
fp = open('./test.html', 'r', encoding='utf-8')
soup = BeautifulSoup(fp, 'lxml')
- 将互联网上获取的页面源码加载到该对象中
page_text = response.text
soup = BeautifulSoup(page_text, 'lxml')
- 提供的用于数据解析的方法和属性:
soup.tagName: 返回的是文档中出现的第一个tagName标签
soup.find()
- soup.find('tagName'): 相当于soup.tagName
- soup.find('tagName', class_='song'): 属性定位
- soup.find_all('tagName'): 返回符合要求的所有标签,是一个列表
soup.select():
- select('某种选择器'): 返回的是一个列表
- 层级选择器:
soup.select('.tang > ul > li > a') > 表示一个层级
soup.select('.tang > ul a') ' '空格表示多个层级
- 获取标签中的文本数据
soup.a.text/.string/.get_text():
text/get_text(): 可以获取某一个标签中的所有文本内容
string: 只可以获取该标签下面直系的文本内容
- 获取标签中属性值
soup.a['属性名']
xpath解析 最常用且最便捷高效的一种解析方式,通用性强。
- 原理1.实例化一个etree对象且需要将被解析的页面源码数据加载到该对象中。
2.调用etree对象中的xpath方法结合着xpath表达式实现标签的定位和内容的捕获。
- 环境的安装:
pip install lxml
- 如何实例化etree对象: from lxml import etree
将本地的html文档中的数据加载到该对象中
etree.parse(filePath)
将互联网上获取的页面源码加载到该对象中
etree.HTML(page_text)
- xpath("xpath表达式")
/: 表示从根节点开始定位,一个/表示一个层级.
//: 表示多个层级(可以表示从任意位置定位)
属性定位: //div[@class="song"] tag[@attrName="attrValue"]
索引定位: tree.xpath('//div[@class="song"]/p[3]') 索引是从1开始的
取文本:/text() 只能取到标签的直系文本 //text() 可以取到标签中非直系的文本内容(所有文本内容)
取属性:/@attrName ==>img/@src
局部解析: title = li.xpath('./a/div[2]//h3/text()')[0] 一定要加"."
验证码识别:
- 识别验证码操作:
- 人工肉眼识别。(不推荐)
- 第三方自动识别。(推荐)
- ddddocr库
import ddddocr
ocr = ddddocr.DdddOcr()
with open('1.png', 'rb') as f:
img_bytes = f.read()
res = ocr.classification(img_bytes)
print(res)
模拟登录:
- 爬取基于某些用户的用户信息。
- 点击登录按钮之后可能会发起一个post请求
- post请求中会携带相关的用户信息用户名密码验证码....
- 页面会更新__VIEWSTATE 页面隐藏域和__VIEWSTATEGENERATOR 页面隐藏域时,我们需要对这个数据也进行爬取
viewstate = tree.xpath("//input[@id='__VIEWSTATE']/@value")[0]
viewstategenerator = tree.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0]
EVENTVALIDATION = tree.xpath("//input[@id='__EVENTVALIDATION']/@value")
- 我们一次只能用requests发一次请求之后再需要发请求时用Session(),将请求包装成一个对象,这样就不会导致访问失败
session = requests.Session()
code_data = session.get(url=code_img_src, headers=headers).content
http/https协议特性 无状态
- 发起的第二次基于个人页面请求的时候,服务器端并不知道此请求是基于登录状态下的请求
- cookie 用来让服务器端记录客户端的相关状态
- 自动处理: cookie值的来源 登陆时post请求中携带有cookie值
session会话对象
- 作用:
- 可以进行请求的发送
- 如果请求过程中产生了cookie则该cookie会被自动存储携带在该session对象中
代理: 破解封IP这种反爬机制
什么是代理:
- 代理服务器
代理的作用:
- 突破自身ip访问的限制
- 可以隐藏自身真实的ip
代理IP的匿名度
- 透明服务器知道使用了代理知道真实ip
- 匿名服务器知道使用了代理不知道真实IP
- 高匿:服务器不知道使用了代理
高性能异步爬虫:
- 异步爬虫的方式:
- 多线程,多进程(不建议): 无法无限制的开启多线程和多进程。
- 进程池、线程池:池的容量是有上限的。
- 单线程 + 异步协程(推荐):
- event_loop: 事件循环,相当于一个无限循环,我们可以把一些函数注册到这个事件循环上,当满足某些条件时,函数就会被循环执行。
- coroutine: 协程对象我们可以将协程对象注册到事件循环中它会被事件循环调用。我们可以使用async关键字来定义一个方法这个方法在调用的时候不会立即执行而是返回一个协程对象。
- task: 任务,它是对协程对象的进一步封装,包含了任务的各个状态。
- future: 代表将要执行或还没有执行的任务实际上和task没有本质区别。
- async: 定义一个协程。
- await: 用来挂起阻塞方法的执行。
selenium模块的基本使用
- 下载selenium pip install selenium
- 下载一个浏览器驱动程序(谷歌)
- 下载路径http://chromedriver.storage.googleapis.com/index.html
- 实例化一个浏览器对象
- 编写基于浏览器自动化操作代码
- 发起请求: get(url)
- 标签定位: find系列方法
- 标签交互: send_keys('xxx')
- 执行js程序: execute_script('jsCode')
- 前进,后退: back(),forward()
- 关闭浏览器: quit()
- selenium处理iframe
- 如果定位的标签存在iframe标签之中,则必须使用switch_to.frame(id)
- 动作链: from selenium.webdriver import ActionChains
- 实例化一个动作链对象: action = ActionChains(bro)
- 执行操作: action.click_and_hold(div) action.move_by_offset(17, 0).perform()
- 释放对象: action.release()
scrapy框架
- 什么是框架?
- 集成了很多功能并且具有很强通用性的一个项目模板。
- 如何学习框架?
- 专门学习框架封装的各种功能的详细用法。
- 什么是scrapy?
- 爬虫中封装好的一个明星框架。
- 功能: 高性能的持久化存储,异步的数据下载,高性能的数据解析操作,分布式
- 基本使用
- 环境安装: mac/linux: pip install scrapy
: windows: pip install wheel
下载twisted下载地址为: http://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
下载twisted: pip install Twisted-20.3.0-cp39-cp39-win_amd64.whl
pip install pywin32
pip install scrapy
- 创建一个工程: scrapy startproject xxxPro
- cd xxxPro
- 在spiders子目录中创建一个爬虫文件
- scrapy genspider spiderName www.xxx.com
- 执行: scrapy crawl spiderName(scrapy crawl test --nolog 采用无日志信息输出,但是这样不好,我们使用接下来的方法)
- 在配置文件中添加: LOG_LEVEL = 'ERROR' 表示只输出错误信息
- 修改user-agent: USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
- scrapy数据解析
- 在parse(self, response)函数中进行编写详细操作方法可见qiubai案例
- scrapy持久化存储
- 基于终端命令的持久化存储:
- 要求: 只可以将parse方法的返回值存储到本地的文本文件中
- 注意持久化存储的文件类型只可以为 'json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle'
- scrapy crawl xxx -o filePath
- 优点: 简洁高效便捷
- 缺点: 局限性比较强(数据只可以存储到指定文件后缀的文本文件中)
- 基于管道持久化存储:
- 数据解析
- 在item类中定义相关的属性
- 将解析到的数据封装到item类型的对象
- 将item类型的对象提交给管道进行持久化存储
- 在管道类的process_item中要将其接受到的item对象中存储的数据进行持久化存储操作
- 在配置文件中开启管道
- 好处:通用性强
- 爬虫文件提交的item类型对象最终会提交给哪一个管道类
- 先执行的管道类
面试题: 将爬取到的数据一份存储到本地一份存储到数据库,如何实现?
- 管道文件中一个管道类对应的是将数据存储到一种平台
- 爬虫文件提交的item只会给管道文件中第一个被执行的管道类接受
- process_item中的return item表示item将会被传递给下一个即将被执行的管道类
基于spider的全站数据爬取
- 就是将网站下的某板块下的全部页码对应的页面数据进行爬取
- 爬取笑话网的段子数据
- 将所有页面的url添加到start_urls列表(不推荐)
- 自行手动进行请求发送
scrapy五大核心部件
- 管道
- 与引擎交互,之后持久化存储
- spider
- 爬虫文件中的爬虫类
- 调度器
- 过滤器: 过滤重复的请求对象
- 队列: 存放请求对象
- 下载器
- 与互联网通信,数据下载
- 引擎
- 接收队列以及发送response
- 所有类都要经过引擎
- 核心作用: 用作数据流处理以及触发事件
请求传参
- 使用场景: 爬取解析的数据不在同一张页面中。(深度爬取)
- 需求: 爬取boss直聘的岗位名称岗位描述
图片数据爬取之ImagePipeline
- 基于scrapy爬取字符串类型数据和图片类型数据的区别
- 字符串: 只需要基于xpath解析且提交管道进行持久化存储
- 图片: 我们只能解析到图片地址,之后我们要单独对图片地址发起请求获取图片二进制类型数据
- 基于ImagePipeline:
- 只需要将img的src属性值提交给管道管道就会对图片的src进行请求发送获取图片的二进制类型的数据且还会进行持久化存储
- 需求: 爬取站长素材中的高清图片
中间件
- 下载中间件
- 位置: 引擎和下载器之间
- 作用: 批量拦截到整个工程中所有的请求和响应
- 拦截请求:
- 请求头信息(UA伪装, 代理ip)
- 拦截响应:
- 篡改响应数据,响应对象
CrawlSpider: 类Spider的一个子类
- 全站数据爬取的方式
- 基于spider实现: 手动请求发送
- 基于CrawlSpider实现
- CrawlSpider的使用:
- 创建一个工程
- 创建爬虫文件: scrapy genspider -t crawl xxx www.xxx.com
- 链接提取器
- 作用: 根据指定规则(allow=r'正则表达式')进行指定链接提取
- 规则解析器
- 作用: 将链接提取器提取到的链接进行指定规则(callback)的解析操作
分布式爬虫
- 概念: 我们需要搭建一个分布式机群,让其对一组资源进行分布联合爬取
- 作用: 提升爬取数据的效率
- 如何实现分布式?
- 安装一个scrapy-redis的组件
- 原生的scrapy是不可以实现分布式爬虫,必须要让scrapy结合着scrapy-redis组件一起实现分布式爬虫
- 为什么原生的scrapy不可以实现分布式爬虫
- 不同电脑的scrapy调度器不能共享
- 管道不可以被分布式机群共享
- scrapy-redis组件的作用:
- 可以给原生的scrapy框架提供可以被共享的管道和调度器
- 实现流程
- 创建一个工程
- 创建一个基于crawlspider的爬虫文件
- 修改当前的爬虫文件:
- from scrapy_redis.spiders import RedisCrawlSpider
- 将start_url allowed_domain进行注释
- redis_key = 'name' 可以被共享的调度器队列的名称
- 编写相关的数据解析操作
- 将当前爬虫类的父类修改成RedisCrawlSpider
- 修改配置文件
- 指定可以被共享的管道:
- ITEM_PIPELINES = {
'scrapy_redis.pipelines.RedisPipeline': 400,
}
- 指定调度器
- DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
- SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
- SCHEDULER_PERSIST = True
- 指定redis服务器
- REDIS_HOST = '127.0.0.1' # 服务器ip
- REDIS_PORT = 6379
- redis相关操作配置:
- 配置redis的配置文件:
- linux或者mac: redis.conf
- windows: redis.windows.conf
- 打开配置文件修改:
- 注释绑定: # bind 127.0.0.1
- 关闭保护模式: protected-mode no
- 结合配置文件启动redis数据库
- redis-server 配置文件
- ./redis-cli
- 执行工程
- scrapy runspider xxx.py
- 向调度器的队列中放入起始url
- 调度器的队列在redis客户端中
- lpush xxx www.xxx.com
增量式爬虫
- 概念: 检测网站数据更新情况,只会爬取网站最新更新出来的数据
- 分析:
- 起始url
- 基于CrawlSpider获取其他页面链接
- 基于Rule将其他页码链接进行请求
- 从每一个页面对应的页面源码中解析出每一部电影详情页的url
- 核心: 检测详情页的url之前有没有请求过
- 将爬取过的电影详情页url存储
- 存储到redis的set数据库
conn = Redis(host='127.0.0.1', port=6379)
ex = self.conn.sadd('urls', detail_url)
if ex == 1:
print('该url没有被爬取过,可以进行数据爬取')
yield scrapy.Request(detail_url, callback=self.parse_detail)
else:
print('数据还没有更新,暂无新数据!')
class PipeLine(object):
conn = None
def open_spider(self, spider):
self.conn = spider.conn
def process_item(self, item, spider):
dic = {
'name': item['name'],
'desc': item['desc'],
}
print(dic)
self.conn.lpush('movieData', dic)
return item
- 对详情页的url发起请求然后解析出电影的名称和时间
- 进行持久化存储
scrapy_splash
- 使用scrapy_splash最终拿到的response相当于在浏览器全部渲染完成之后的网页页面
- 作用: 模拟浏览器加载js并返回js运行后的数据
- 安装环境:
- 安装docker
- sudo docker pull scrapinghub/splash
- 尝试运行镜像:
- 在前台运行: sudo docker run -p 8050:8050 scrapinghub/splash
- 在后台运行: sudo docker run -d -p 8050:8050 scrapinghub/splash
JS逆向
- 数据加密
- 看到的是一堆密文
- 请求头加密
- 表单加密
- 模拟生成规则,在被加密前是什么内容
- 参数加密
- cookie加密
- 通常是在浏览器有正确地响应但是爬虫返回的是一堆js代码或者非正常的响应
Web逆向技巧
- 爬虫的接口定位
- 字体加密Unicode编码数据加密
- 无混淆的js
- 关键字搜索
- 解密搜decrypt
- 加密搜encrypt
- ajax渲染搜JSON.parse: JSON.parse(函数或者方法(密文)), 或者 a = 函数或者方法(密文); JSON.parse(a)
- 搜接口自带的关键字(特点:方法或者函数包裹密文数据)
- xhr断点
- 路径搜索
- 跟栈
- hook
- 反debug: 内存写入变量
- 注入: 控制台注入 本地替换
1. html -- lxml -- re
2. json -- 如何提取键值以及组装成自己想要的样子

16
request/01-Request.py Normal file
View File

@ -0,0 +1,16 @@
# requests模块的使用
import requests
if __name__ == "__main__":
    # Page whose HTML source is downloaded.
    target = 'https://wz.sun0769.com/political/index/politicsNewest'
    # requests.get returns a response object; its .text attribute is the
    # decoded body of the page.
    html = requests.get(url=target).text
    # Persist the page source to disk.
    with open('./sogou.html', 'w', encoding='utf-8') as out:
        out.write(html)
    print('爬取数据结束!')

View File

@ -0,0 +1,24 @@
# UA检测反爬机制门户网站的服务器会检测对应请求的载体身份标识如果检测到请求的载体身份为某一浏览器说明该请求是一个正常请求。
# 但是如果检测到不是某一浏览器,则表示该请求为非正常请求。服务器端拒绝该次请求。
# UAUser-Agent请求载体的身份标识
# UA伪装让爬虫身份标识伪装成浏览器
import requests
if __name__ == '__main__':
    # The keyword typed by the user is both the search term and the stem of
    # the output file name.
    keyword = input('enter a word:')
    out_name = f'{keyword}.html'
    # The query string is passed through `params` so requests encodes it, and
    # the User-Agent header makes the request look like it comes from a
    # browser (UA spoofing).
    resp = requests.get(
        url='https://www.sogou.com/web?',
        params={'query': keyword},
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
        },
    )
    # Save the returned page source.
    with open(out_name, 'w', encoding='utf-8') as out:
        out.write(resp.text)
    print(out_name, '保存成功!')

View File

@ -0,0 +1,26 @@
# post请求携带了参数
# 响应数据是一组json数据
import requests
import json
if __name__ == '__main__':
    # Endpoint that is queried with a POST request.
    post_url = 'https://fanyi.baidu.com/sug'
    # UA spoofing: present the request as coming from a regular browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    # The POST form carries the word to look up under the 'kw' key.
    word = input('enter a word:')
    data = {
        'kw': word
    }
    # Send the request.
    response = requests.post(url=post_url, data=data, headers=headers)
    # Response.json() decodes the body into Python objects; it only works
    # when the server really returned JSON.
    dic_obj = response.json()
    # Persist the result. The with-block closes (and therefore flushes) the
    # file even if serialization fails; the previous bare open() never
    # closed the handle.
    filename = word + '.json'
    with open(filename, 'w', encoding='utf-8') as fp:
        json.dump(dic_obj, fp=fp, ensure_ascii=False)
    print('over!!')

View File

@ -0,0 +1,23 @@
import requests
import json
if __name__ == '__main__':
    # Endpoint that returns the chart listing as JSON.
    url = 'https://movie.douban.com/j/chart/top_list'
    # Query-string parameters sent with the GET request.
    param = {
        'type': '24',
        'interval_id': '100:90',
        'action': '',
        'start': '1',
        'limit': '20',
    }
    # UA spoofing: present the request as coming from a regular browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    response = requests.get(url=url, params=param, headers=headers)
    # Decode the JSON body into Python objects.
    list_data = response.json()
    print(list_data)
    # The with-block closes (and therefore flushes) the file even if
    # serialization fails; the previous bare open() never closed the handle.
    with open('./douban.json', 'w', encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)
    print('Over!!')

View File

@ -0,0 +1,27 @@
import requests
import json
if __name__ == '__main__':
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    # UA spoofing: present the request as coming from a regular browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    place = input('enter a place:')
    fileName = place + 'KFC餐厅位置信息' + '.json'
    # Open the output once, in append mode, instead of reopening it for
    # every page.
    with open(fileName, 'a', encoding='utf-8') as fp:
        # Pages are numbered from 1; 20 is the generous upper bound the
        # script uses in order to reach every page.
        for page in range(1, 21):
            param = {
                'cname': '',
                'pid': '',
                'keyword': place,   # place to search for
                'pageIndex': page,  # page being requested
                'pageSize': '10',   # at most 10 results per page
            }
            response = requests.post(url=url, params=param, headers=headers)
            # Dump the decoded object, not the raw response text: dumping
            # the text wrote a JSON string whose content was itself JSON
            # (double-encoded and full of escaped quotes).
            json.dump(response.json(), fp=fp, ensure_ascii=False)
            # One JSON document per line, one line per fetched page.
            fp.write('\n')
    print('over!!!')

View File

View File

@ -0,0 +1,15 @@
import random
# Pool of raw Cookie header strings; cookie_dic() picks one at random.
COOKIE_LIST = [
    'wd_guid=544d13f9-f072-4fdc-9989-84452f1ecd52; historyState=state; _bl_uid=XtlO5cqLjv05qpj3t0d0nna8msI4; lastCity=101020100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1673095377,1673165470,1673257271,1673333037; boss_login_mode=sms; __fid=c58f56b0daac21ec5273e9b4b258f537; wt2=DY4IX_Pe18l5jPqD0AYgnA-G9UnTNtDaZ_zMhCpK7UovHjn5bKxYiZ6NtwTrfsFzsgpxFtIBCopvwd7HdvXTGrg~~; wbg=0; __zp_stoken__=887aefCE3dDAxC0wecFokLmdqeARKZz80V3cWbnglEDsONSs%2FVCMzL295aWdxVWw6Ry4PehcuLyROcX4mdTpZXyFXVEtiREADYGooaVQmYhwcSUtZVAQoNVpLLXZRQkdxBRc9G0QGUFhyNA0%3D; geek_zp_token=V1RN0kEOL031ZiVtRvyB4bLCuw6zrQxCo~; __l=l=%2Fwww.zhipin.com%2Fshanghai%2F&r=&g=&s=3&friend_source=0&s=3&friend_source=0; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1673349533; __c=1673333037; __a=68265253.1672926940.1673257271.1673333037.431.9.106.431'
]


def cookie_dic():
    """Return one randomly chosen entry of COOKIE_LIST parsed into a dict.

    The raw string is split on ';' into ``name=value`` fragments. Each
    fragment is stripped of surrounding whitespace and split on its FIRST
    '=' only, so names carry no leading space and values that themselves
    contain '=' are kept whole.

    Returns:
        dict mapping cookie name to cookie value (both ``str``). A fragment
        without '=' maps its name to the empty string; empty fragments
        (e.g. after a trailing ';') are skipped.
    """
    cookie_string = random.choice(COOKIE_LIST)
    cookie_dict = {}
    for fragment in cookie_string.split(';'):
        fragment = fragment.strip()
        if not fragment:
            continue
        # partition() splits on the first '=' only; split('=')[1] cut
        # values such as 'l=%2Fwww...' down to 'l'.
        name, _, value = fragment.partition('=')
        cookie_dict[name] = value
    return cookie_dict

View File

@ -0,0 +1,454 @@
import random
import requests
USER_AGENTS = [
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_4) AppleWebKit/537.13 (KHTML, like Gecko) Chrome/24.0.1290.1 Safari/537.13",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.6 Safari/537.11",
"Mozilla/5.0 (Windows NT 6.0) yi; AppleWebKit/345667.12221 (KHTML, like Gecko) Chrome/23.0.1271.26 Safari/453667.1221",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.45 Safari/535.19",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.45 Safari/535.19",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.04 Chromium/17.0.963.65 Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/10.10 Chromium/17.0.963.65 Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Ubuntu/11.10 Chromium/17.0.963.65 Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (X11; FreeBSD amd64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_4) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.65 Safari/535.11",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.75 Safari/535.7",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.8",
"Mozilla/5.0 (Windows NT 5.2; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7",
"Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.861.0 Safari/535.2",
"Chrome/15.0.860.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/15.0.860.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.814.0 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1",
"Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.813.0 Safari/535.1",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.810.0 Safari/535.1",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/10.04 Chromium/14.0.808.0 Chrome/14.0.808.0 Safari/535.1",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/14.0.803.0 Chrome/14.0.803.0 Safari/535.1",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.803.0 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.801.0 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.794.0 Safari/535.1",
"Mozilla/5.0 (Windows NT 5.2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.792.0 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.790.0 Safari/535.1",
"Mozilla/5.0 ArchLinux (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Ubuntu/11.04 Chromium/13.0.782.41 Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (Windows NT 5.2; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_3) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.220 Safari/535.1",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.215 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.215 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.215 Safari/535.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.20 Safari/535.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.20 Safari/535.1",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.20 Safari/535.1",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.107 Safari/535.1",
"Mozilla/5.0 (X11; Linux amd64) AppleWebKit/534.36 (KHTML, like Gecko) Chrome/13.0.766.0 Safari/534.36",
"Mozilla/5.0 (X11; CrOS i686 12.0.742.91) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.93 Safari/534.30",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.91 Chromium/12.0.742.91 Safari/534.30",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/10.10 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/10.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/11.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/10.10 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30 (KHTML, like Gecko) Ubuntu/10.04 Chromium/12.0.742.112 Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (Windows NT 7.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (Windows 8) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_6) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_4) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.112 Safari/534.30",
"Mozilla/5.0 ArchLinux (X11; U; Linux x86_64; en-US) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30 (KHTML, like Gecko) Slackware/Chrome/12.0.742.100 Safari/534.30",
"Mozilla/5.0 (X11; Linux i686) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_4) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/12.0.702.0 Safari/534.24",
"Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.699.0 Safari/534.24",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_7) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_5_8) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.68 Safari/534.24",
"Mozilla/5.0 (Windows NT 6.0; WOW64) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.34 Safari/534.24",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.3 Safari/534.24",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.3 Safari/534.24",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_6) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.12 Safari/534.24",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.0 Safari/534.24",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_7_0; en-US) AppleWebKit/534.21 (KHTML, like Gecko) Chrome/11.0.678.0 Safari/534.21",
"Mozilla/5.0 (Windows NT) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.18 (KHTML, like Gecko) Chrome/11.0.661.0 Safari/534.18",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/11.0.655.0 Safari/534.17",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.17 (KHTML, like Gecko) Chrome/10.0.649.0 Safari/534.17",
"Mozilla/5.0 (X11; U; FreeBSD x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.204 Safari/534.16",
"Mozilla/5.0 (X11; U; FreeBSD i386; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.204 Safari/534.16",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.204",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.134 Safari/534.16",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Ubuntu/10.10 Chromium/10.0.648.133 Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.127 Safari/534.16",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.127 Safari/534.16",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.127 Safari/534.16",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU; AppleWebKit/534.16; KHTML; like Gecko; Chrome/10.0.648.11;Safari/534.16)",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; ru-RU) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.11 Safari/534.16",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Ubuntu/10.10 Chromium/10.0.648.0 Chrome/10.0.648.0 Safari/534.16",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.0 Safari/534.16",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.634.0 Safari/534.16",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.15 (KHTML, like Gecko) Ubuntu/10.10 Chromium/10.0.613.0 Chrome/10.0.613.0 Safari/534.15",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/540.0 (KHTML, like Gecko) Ubuntu/10.10 Chrome/9.1.0.0 Safari/540.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.600.0 Safari/534.14",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.15 Safari/534.13",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.107 Safari/534.13 v1416748405.3871",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.107 Safari/534.13 v1416670950.695",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.107 Safari/534.13 v1416664997.4379",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.107 Safari/534.13 v1333515017.9196",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_5; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13",
"Mozilla/5.0 (X11; U; CrOS i686 0.9.128; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.339",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/533.3",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/534.10",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.215 Safari/534.10",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.215 Safari/534.10",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; de-DE) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/7.0.540.0 Safari/534.10",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
"Mozilla/5.0 (ipad Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.6 (KHTML, like Gecko) Chrome/7.0.498.0 Safari/534.6",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.464.0 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.462.0 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.461.0 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.460.0 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.1 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.1 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.1 Safari/534.3",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.1 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.0 Safari/534.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/534.2 (KHTML, like Gecko) Chrome/6.0.454.0 Safari/534.2",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-US) AppleWebKit/534.2 (KHTML, like Gecko) Chrome/6.0.453.1 Safari/534.2",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/534.2 (KHTML, like Gecko) Chrome/6.0.453.1 Safari/534.2",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-GB) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-US) AppleWebKit/534.1 (KHTML, like Gecko) Chrome/6.0.428.0 Safari/534.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.86 Safari/533.4",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_0; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.86 Safari/533.4",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; fr-FR) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.126 Safari/533.4",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.366.0 Safari/533.4",
"Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/5.0.358.0 Safari/533.3",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/5.0.358.0 Safari/533.3",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/5.0.354.0 Safari/533.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/5.0.353.0 Safari/533.3",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.343.0 Safari/533.2",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.7 Safari/533.2",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.3 Safari/533.2",
"Mozilla/5.0 (X11; U; Linux i586; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.1 Safari/533.2",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.2 (KHTML, like Gecko) Chrome/5.0.342.1 Safari/533.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.16 (KHTML, like Gecko) Chrome/5.0.335.0 Safari/533.16",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_8; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; de-DE) Chrome/4.0.223.3 Safari/532.2",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.2 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.1 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.223.1 Safari/532.2",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.6 Safari/532.2",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.5 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.5 Safari/532.2",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.5 Safari/532.2",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.4 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.4 Safari/532.2",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.4 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.3 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.3 Safari/532.2",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.2 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.12 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.222.12 Safari/532.2",
"Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.8 Safari/532.2",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.8 Safari/532.2",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.8 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.6 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.2 (KHTML, like Gecko) Chrome/4.0.221.6 Safari/532.2",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.5 Safari/532.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.3 Safari/532.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.219.3 Safari/532.1",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.1 Safari/532.1",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.0 Safari/532.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.0 Safari/532.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.213.0 Safari/532.1",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.212.1 Safari/532.1",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.1 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.1",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.212.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.7 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.4 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.2 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.211.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.210.0 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.209.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.209.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.209.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.208.0 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.207.0 Safari/532.0",
"Mozilla/5.0 (X11; U; FreeBSD i386; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.207.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.207.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.207.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.207.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.207.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.1 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.1 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.1 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.1 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.1 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.0 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.204.0 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.2 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.203.0 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0 (x86_64); de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; de-DE) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/525.13.",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.202.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.201.1 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.198.1 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.198.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.198.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.198.0 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_8; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.198 Safari/532.0",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_7; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.198 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.11 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.11 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.11 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.197.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.196.2 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.196.2 Safari/532.0",
"Mozilla/5.0 (X11; U; Linux i686 (x86_64); en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.196.0 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.6 Safari/532.0",
"Mozilla/4.0 (Windows; U; Windows NT 5.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.33 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.3 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML,like Gecko) Chrome/3.0.195.27",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.27 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.24 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.21 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.20 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.17 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.10 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/3.0.195.1 Safari/532.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.4 (KHTML, like Gecko) Chrome/3.0.194.0 Safari/531.4",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/531.3 (KHTML, like Gecko) Chrome/3.0.193.2 Safari/531.3",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/531.0 (KHTML, like Gecko) Chrome/3.0.191.0 Safari/531.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/531.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/530.0 (KHTML, like Gecko) Chrome/2.0.182.0 Safari/531.0",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.177.0 Safari/530.7",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.176.0 Safari/530.7",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.7 (KHTML, like Gecko) Chrome/2.0.175.0 Safari/530.7",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.175.0 Safari/530.6",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.5",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_2; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/2.0.174.0 Safari/530.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.173.1 Safari/530.5",
"Mozilla/6.0 (Windows; U; Windows NT 6.0; en-US) Gecko/2009032609 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.7",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.6 Safari/530.5",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.43 Safari/530.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.40 Safari/530.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.39 Safari/530.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.2 Safari/530.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; eu) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.4",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/530.4 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.4",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/2.0.172.0 Safari/530.5",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_0; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.2 Safari/528.10",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.9 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.9",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.11 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.11",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.10 (KHTML, like Gecko) Chrome/2.0.157.0 Safari/528.10",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/2.0.156.1 Safari/528.8",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/2.0.156.1 Safari/528.8",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/528.8 (KHTML, like Gecko) Chrome/2.0.156.0 Safari/528.8",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.59 Safari/525.19",
"Mozilla/4.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.59 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.55 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.53 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.50 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.43 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.43 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/1.0.154.43 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.3.155.0 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.0 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.152.0 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.151.0 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.151.0 Safari/525.19",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.6 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.30 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 6.0; de) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13(KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
"Mozilla/5.0 (Windows; U; Windows NT 5.0; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
"Mozilla/5.0 (Linux; U; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
"Mozilla/5.0 (Macintosh; U; Mac OS X 10_5_7; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.9 (KHTML, like Gecko) Chrome/ Safari/530.9",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.6 (KHTML, like Gecko) Chrome/ Safari/530.6",
"Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_5_6; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5",
]
def get_ua():
    """Return one User-Agent string picked at random from ``USER_AGENTS``."""
    chosen = random.choice(USER_AGENTS)
    return chosen
def get_requests_headers():
    """Build a browser-like header dict carrying a randomly chosen User-Agent.

    Returns:
        dict: a freshly built header mapping. Only the ``User-Agent`` value
        varies between calls; every other entry is a fixed string.
    """
    user_agent = random.choice(USER_AGENTS)
    return {
        'User-Agent': user_agent,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
        'Connection': 'close',
        'Accept-Encoding': 'gzip, deflate, br',
        'Referer': 'https://www.zhipin.com/job_detail/?city=101020100&source=10&query=python',
        'sec-ch-ua-platform': '"Android"',
        'sec-ch-ua-mobile': '?1',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'upgrade-insecure-requests': '1',
    }
if __name__ == '__main__':
    # Module self-check: show one generated header set, then use another one
    # to fetch a single page and dump its text.
    print(get_requests_headers())
    # Without a timeout requests waits indefinitely on an unresponsive host,
    # which would make this self-check hang.
    response = requests.get(
        'http://www.ip3366.net/?stype=1&page=1',
        headers=get_requests_headers(),
        timeout=10,
    )
    # Decode the body as gb2312 and drop any bytes that cannot be decoded.
    print(response.content.decode("gb2312", "ignore"))

View File

@ -0,0 +1,18 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class BossjobItem(scrapy.Item):
    """Scrapy item holding the fields collected for one job posting."""

    pay = scrapy.Field()           # salary
    job_name = scrapy.Field()      # position title
    detail_url = scrapy.Field()    # link to the job detail page
    company_name = scrapy.Field()  # company name
    requirement = scrapy.Field()   # requirements
    detail = scrapy.Field()

View File

@ -0,0 +1,184 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
import time
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
# 中间件1 -随机UA
from scrapy.http import HtmlResponse
from .requset import SeleniumRequest
from .fake_useragent import get_ua
class BossjobRandomuaDownloaderMiddleware(object):
    """Downloader middleware that stamps a User-Agent on every request."""

    def process_request(self, request, spider):
        """Overwrite the ``User-Agent`` header with the value from ``get_ua()``.

        Always returns ``None``.
        """
        request.headers['User-Agent'] = get_ua()
        return None
# 中间件2 -随机代理
import random
from .settings import proxy_list
class BossjobRandomProxyDownloadMiddleware(object):
    """Downloader middleware that routes each request through one fixed proxy.

    NOTE(review): despite the class name, no random choice is made here; the
    same tunnel endpoint and credentials are used for every request, and the
    credentials are hard-coded in this method.
    """

    def process_request(self, request, spider):
        """Set ``request.meta['proxy']`` and force ``Connection: close``.

        Always returns ``None``.
        """
        tunnel = "u286.kdltps.com:15818"
        # Username/password authentication is embedded in the proxy URL.
        user = "t17335887797243"
        pwd = "n62s2uvp"
        request.meta['proxy'] = "http://{}:{}@{}/".format(user, pwd, tunnel)
        # Whitelist authentication would use the bare endpoint instead:
        # request.meta['proxy'] = "http://%(proxy)s/" % {"proxy": proxy}
        request.headers["Connection"] = "close"
        return None
# 中间件3 -Cookie
from .fakeCookie import COOKIE_LIST
class BossjobCookieDownloaderMiddleware(object):
    """Downloader middleware that attaches a fixed set of cookies to requests."""

    def process_request(self, request, spider):
        """Replace ``request.cookies`` with the dict from ``get_cookies()``.

        Always returns ``None``.
        """
        request.cookies = self.get_cookies()
        return None

    def get_cookies(self):
        """Parse the hard-coded cookie header string into a ``{name: value}`` dict.

        Each ``;``-separated segment is split on its FIRST ``=`` only, so values
        that themselves contain ``=`` (such as ``__l``) are kept whole, and the
        whitespace that follows each ``;`` is stripped from the cookie name.
        Segments that are empty or have no ``=`` are skipped.

        Returns:
            dict: cookie names mapped to their raw (still URL-encoded) values.
        """
        cookie_string = 'wd_guid=544d13f9-f072-4fdc-9989-84452f1ecd52; historyState=state; _bl_uid=XtlO5cqLjv05qpj3t0d0nna8msI4; lastCity=101020100; wt2=DY4IX_Pe18l5jPqD0AYgnA-G9UnTNtDaZ_zMhCpK7UovHjn5bKxYiZ6NtwTrfsFzsgpxFtIBCopvwd7HdvXTGrg~~; wbg=0; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1673257271,1673333037,1673421249,1673621120; __g=-; __l=l=%2Fwww.zhipin.com%2Fjob_detail%2F01fd3a4e0ace71af1nx_0t-1F1pZ.html&s=3&friend_source=0&s=3&friend_source=0; geek_zp_token=V1RN0kEOL031ZiVtRvyB4eKymy7j3Vwi4~; __c=1673621123; __a=68265253.1672926940.1673421249.1673621123.475.11.15.475; __zp_stoken__=357feaV5aXwJLbUlmOy4uTW43dBlpeEsAbV5LT1RBZ10vQAMUSG4OBXFMIDkiIkJ0D3Z%2Bb35WOlduHEoVLlt3bnRiWQNiGnw7AgQdWhkjdlJNETohVUMiZCUfHx8IKAQ%2FTU9MDi1fN3RRXTk%3D; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1673621689'
        cookie_dict = {}
        for pair in cookie_string.split(';'):
            # partition() splits on the first '=' only, leaving the rest of the
            # segment (including any further '=') in the value.
            name, sep, value = pair.strip().partition('=')
            if not sep or not name:
                continue
            cookie_dict[name] = value
        return cookie_dict
import zipfile
import string
from selenium import webdriver
class seleniumDownloaderMiddleware(object):
    """Downloader middleware that renders ``SeleniumRequest``s in Chrome.

    On construction it builds a Chrome extension that configures an
    authenticated proxy, starts a Chrome driver with that extension loaded,
    and hides ``navigator.webdriver``. Requests that are not
    ``SeleniumRequest`` instances are left for the normal downloader.
    """

    def __init__(self):
        self.option = webdriver.ChromeOptions()
        proxyauth_plugin_path = self._create_proxyauth_extension(
            tunnelhost="u286.kdltps.com",  # tunnel host name
            tunnelport="15818",  # port
            proxy_username="t17335887797243",  # user name
            proxy_password="n62s2uvp"  # password
        )
        self.option.add_extension(proxyauth_plugin_path)
        # Headless mode is left disabled:
        # self.option.add_argument('--headless')
        # Both switches must be passed in ONE call: add_experimental_option
        # stores one value per option name, so a second call for
        # 'excludeSwitches' would replace 'enable-automation'.
        self.option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
        self.option.add_experimental_option('useAutomationExtension', False)
        self.option.add_argument('blink-settings=imagesEnabled=false')
        self.option.add_argument("--no-sandbox")
        self.option.add_argument("--disable-dev-shm-usage")
        self.option.add_argument('--disable-gpu')
        # Raw string: the backslashes are path separators, not escapes.
        self.bro = webdriver.Chrome(executable_path=r'D:\爬虫\selenium\chromedriver.exe', options=self.option)
        # Injected before any page script runs so navigator.webdriver reads undefined.
        self.bro.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
        })

    @staticmethod
    def _create_proxyauth_extension(tunnelhost, tunnelport, proxy_username, proxy_password, scheme='http',
                                    plugin_path=None):
        """Write a Chrome extension zip that sets a fixed proxy and answers its auth prompt.

        Args:
            tunnelhost: proxy host name substituted into the extension script.
            tunnelport: proxy port substituted into the extension script.
            proxy_username: user name returned from the auth callback.
            proxy_password: password returned from the auth callback.
            scheme: proxy scheme, ``'http'`` by default.
            plugin_path: output zip path; defaults to
                ``'vimm_chrome_proxyauth_plugin.zip'`` (relative to the
                current working directory).

        Returns:
            The path of the zip file that was written.
        """
        if plugin_path is None:
            plugin_path = 'vimm_chrome_proxyauth_plugin.zip'
        manifest_json = """
{
"version": "1.0.0",
"manifest_version": 2,
"name": "Chrome Proxy",
"permissions": [
"proxy",
"tabs",
"unlimitedStorage",
"storage",
"<all_urls>",
"webRequest",
"webRequestBlocking"
],
"background": {
"scripts": ["background.js"]
},
"minimum_chrome_version":"22.0.0"
}
"""
        background_js = string.Template(
            """
var config = {
mode: "fixed_servers",
rules: {
singleProxy: {
scheme: "${scheme}",
host: "${host}",
port: parseInt(${port})
},
bypassList: ["foobar.com"]
}
};
chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
function callbackFn(details) {
return {
authCredentials: {
username: "${username}",
password: "${password}"
}
};
}
chrome.webRequest.onAuthRequired.addListener(
callbackFn,
{urls: ["<all_urls>"]},
['blocking']
);
"""
        ).substitute(
            host=tunnelhost,
            port=tunnelport,
            username=proxy_username,
            password=proxy_password,
            scheme=scheme,
        )
        with zipfile.ZipFile(plugin_path, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        return plugin_path

    def __del__(self):
        """Shut the browser down; safe to call more than once.

        This method is also registered as the ``spider_closed`` handler in
        ``from_crawler``, so it can run once from the signal and again at
        garbage collection. The driver reference is cleared first so the
        second call is a no-op, and ``getattr`` covers the case where
        ``__init__`` failed before ``self.bro`` was assigned.
        """
        bro = getattr(self, 'bro', None)
        if bro is None:
            return
        self.bro = None
        # quit() ends the whole driver session; close() only closes the
        # current window and would leave the driver process running.
        bro.quit()

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and hook its shutdown to ``spider_closed``."""
        s = cls()
        crawler.signals.connect(s.__del__, signal=signals.spider_closed)
        return s

    # NOTE(review): the parameters are ordered (spider, request), the reverse
    # of Scrapy's documented process_request(request, spider). This only works
    # if the caller passes both by keyword - confirm before reordering.
    def process_request(self, spider, request):
        """Render ``SeleniumRequest``s in the browser; pass everything else through.

        Returns:
            An ``HtmlResponse`` built from the browser's page source for a
            ``SeleniumRequest``, otherwise ``None``.
        """
        # Every request passes through here; only SeleniumRequest is handled.
        if not isinstance(request, SeleniumRequest):
            return None
        self.bro.get(request.url)
        # Fixed pause before reading the page source.
        time.sleep(2)
        page_text = self.bro.page_source
        return HtmlResponse(url=request.url, status=200, body=page_text, request=request, encoding='utf-8')

View File

@ -0,0 +1,45 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
class BossjobPipeline:
    """Pipeline stage that prints each item's ``detail`` field."""

    def process_item(self, item, spider):
        """Print ``item['detail']`` and hand the same item on unchanged."""
        detail_text = item['detail']
        print(detail_text)
        return item
class mysqlPipeLine(object):
    """Pipeline stage that stores each item as one row of the ``bossjob`` table.

    NOTE(review): the connection parameters, including the password, are
    hard-coded in ``open_spider``.
    """

    # Database connection and the single cursor reused for every insert.
    conn = None
    cursor = None

    def open_spider(self, spider):
        """Open the MySQL connection and create the cursor used for inserts."""
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='dxs666dxs', db='Spider',
                                    charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item and commit; on any error print it and roll back.

        The values are passed as query parameters so the driver quotes and
        escapes them, instead of being interpolated into the SQL text (which
        breaks on values containing quotes and allows SQL injection).

        Returns:
            The item, unchanged, whether or not the insert succeeded.
        """
        if self.cursor is None:
            # Covers the case where the connection was set up without open_spider.
            self.cursor = self.conn.cursor()
        try:
            self.cursor.execute(
                'insert into bossjob values(%s, %s, %s, %s, %s)',
                (item["company_name"], item["detail_url"], item["job_name"], item["pay"], item["requirement"]),
            )
            self.conn.commit()
            print('成功插入', item['job_name'], '的工作信息到数据库中!')
        except Exception as e:
            # Deliberate best effort: a failed insert must not stop the crawl.
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection if they were opened."""
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()

View File

@ -0,0 +1,5 @@
from scrapy import Request
class SeleniumRequest(Request):
    """Marker subclass of ``Request``; it adds no attributes or behaviour."""

View File

@ -0,0 +1,108 @@
# Scrapy settings for bossjob project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project name and the package that holds (and receives newly generated) spiders
BOT_NAME = 'bossjob'
SPIDER_MODULES = ['bossjob.spiders']
NEWSPIDER_MODULE = 'bossjob.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
# robots.txt rules are NOT obeyed by this project
ROBOTSTXT_OBEY = False
# Only messages at ERROR level are logged
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# (no DOWNLOAD_DELAY is set in this project)
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Cookies are explicitly kept enabled (this is also Scrapy's default)
COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'zh-CN,zh;q=0.9'
# }
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'bossjob.middlewares.BossjobSpiderMiddleware': 543,
# }
# Candidate proxies, each written as "host:port".
# NOTE(review): Scrapy's settings loader only exposes UPPERCASE names, so this
# list is presumably imported directly from this module by the proxy
# middleware - confirm before renaming.
proxy_list = [
    "61.216.185.88:60808",
    "121.13.252.60:41564",
    "202.109.157.64:9000",
    "120.24.76.81:8123",
    "210.5.10.87:53281",
    "117.41.38.16:9000",
    "117.41.38.18:9000",
    "121.13.252.62:41564",
    "112.14.47.6:52024",
    "222.74.73.202:42055",
    "121.13.252.58:41564",
    "117.114.149.66:55443",
    "27.42.168.46:55481",
    "121.13.252.61:41564",
    "183.236.232.160:8080",
]
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Active downloader middlewares mapped to their order value; the proxy and
# selenium middlewares are currently switched off.
DOWNLOADER_MIDDLEWARES = {
    #'bossjob.middlewares.BossjobRandomProxyDownloadMiddleware': 98,
    #'bossjob.middlewares.seleniumDownloaderMiddleware': 99,
    'bossjob.middlewares.BossjobCookieDownloaderMiddleware': 400,
    'bossjob.middlewares.BossjobRandomuaDownloaderMiddleware': 500,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Only the MySQL pipeline is enabled; the printing pipeline is switched off.
ITEM_PIPELINES = {
    # 'bossjob.pipelines.BossjobPipeline': 300,
    'bossjob.pipelines.mysqlPipeLine': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,43 @@
import json
import scrapy
from ..items import BossjobItem
from lxml import etree
from ..requset import SeleniumRequest
class BossSpider(scrapy.Spider):
    """Spider that reads a job-list JSON endpoint and yields one item per ``<li>``."""

    name = 'boss'

    def start_requests(self):
        """Yield one request per result page, for pages 51 to 89."""
        for pageNum in range(51, 90):
            url = f'https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json?page={pageNum}&city=101020100&query='
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response, **kwargs):
        """Parse the HTML fragment embedded in the JSON reply into items.

        The reply is JSON whose ``zpData.html`` value is an HTML fragment;
        every ``<li>`` in that fragment becomes one ``BossjobItem``. An
        ``<li>`` that lacks one of the expected nodes is logged and skipped
        so that it does not abort the rest of the page.
        """
        payload = json.loads(response.text)
        tree = etree.HTML(payload['zpData']['html'])
        for li in tree.xpath('//li'):
            try:
                job_name = li.xpath('./a/div[1]/span[1]/text()')[0]
                detail_url = 'https://www.zhipin.com' + li.xpath('./a/@href')[0]
                pay = li.xpath('a/div[1]/span[2]/text()')[0]
                company_name = li.xpath('./a/div[2]/span[1]/text()')[0]
            except IndexError:
                self.logger.warning('skipping malformed <li> in %s', response.url)
                continue
            requirement = li.xpath('./a/div[3]//text()')
            # The first text node is left out and every remaining piece is
            # followed by one space, exactly as before.
            # NOTE(review): presumably the first node is not useful data - confirm.
            requirement_text = ''.join(piece.strip() + ' ' for piece in requirement[1:])

            item = BossjobItem()
            item['job_name'] = job_name
            item['detail_url'] = detail_url
            item['pay'] = pay
            item['company_name'] = company_name
            item['requirement'] = requirement_text
            yield item

11
scrapy/bossjob/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = bossjob.settings
[deploy]
#url = http://localhost:6800/
project = bossjob

Binary file not shown.

View File

View File

@ -0,0 +1,196 @@
import random
import requests
# Pool of browser user-agent strings used for random UA rotation.
# Every entry must end with a comma: two adjacent string literals without one
# are silently merged into a single bogus entry.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
    "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
    "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
    "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
    "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
    "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
    "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
    "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
    "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
    "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
    "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
    "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
    "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
    "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
    "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
    "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
    "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
    "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
    "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
    "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
    "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
    "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
    "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
    "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
    "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
    "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
    "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
    "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
]
def get_ua():
    """Return one user-agent string picked at random from ``USER_AGENTS``."""
    chosen = random.choice(USER_AGENTS)
    return chosen
def get_requests_headers():
    """Build a browser-like header dict with a randomly chosen user agent.

    Returns:
        dict: request headers whose ``User-Agent`` value is drawn at random
        from ``USER_AGENTS``.
    """
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'close',
        'Accept-Encoding': 'gzip, deflate, br',
        # 'Host': 'www.zhipin.com',
        # 'Origin': 'https://www.zhipin.com',
        # 'Referer': 'https://www.zhipin.com/',
    }
    # Without this return the function yields None and callers send no
    # custom headers at all.
    return headers
if __name__ == '__main__':
    # Module self-check: print one generated header set, then fetch a page
    # using a second generated header set and print its decoded body.
    print(get_requests_headers())
    target_url = 'http://www.ip3366.net/?stype=1&page=1'
    request_headers = get_requests_headers()
    response = requests.get(target_url, headers=request_headers)
    raw_body = response.content
    print(raw_body.decode("gb2312", "ignore"))

View File

@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class CaipiaoItem(scrapy.Item):
    """Container for one scraped lottery row."""

    # issue number of the draw
    qihao = scrapy.Field()
    # red ball numbers
    red_ball = scrapy.Field()
    # blue ball number
    blue_ball = scrapy.Field()

View File

@ -0,0 +1,46 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
from time import sleep
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
from .fake_useragent import USER_AGENTS
class CaipiaoDownloaderMiddleware:
    """Downloader middleware that rotates the user agent and renders pages with Selenium.

    Not all methods need to be defined. If a method is not defined, scrapy
    acts as if the downloader middleware does not modify the passed objects.
    """

    # Issue range typed into the page's "from"/"to" inputs before the query
    # is submitted; override on a subclass to fetch another range.
    start_issue = '16001'
    end_issue = '23004'

    def process_request(self, request, spider):
        """Set a randomly chosen ``User-Agent`` header on the outgoing request."""
        # UA spoofing
        request.headers['User-Agent'] = random.choice(USER_AGENTS)
        return None

    def process_response(self, request, response, spider):
        """Replace the downloaded response with the page rendered by ``spider.bro``.

        The browser loads the same URL, clicks the range selector, fills in
        the issue range, submits it, and the resulting page source is
        returned as a new ``HtmlResponse``.
        """
        bro = spider.bro
        bro.get(request.url)
        sleep(0.5)
        # find_element(by, value) is used instead of the find_element_by_*
        # helpers, which newer Selenium releases no longer provide.
        bro.find_element('xpath', '//*[@id="link248"]/img').click()
        start = bro.find_element('id', 'from')
        start.clear()
        start.send_keys(self.start_issue)
        end = bro.find_element('id', 'to')
        end.clear()
        end.send_keys(self.end_issue)
        bro.find_element('id', 'link176').click()
        page_text = bro.page_source
        return HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)

    def process_exception(self, request, exception, spider):
        """Do nothing and return ``None``."""
        return None

View File

@ -0,0 +1,63 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymysql
'''
存储数据的方案
1数据要存在csv文件中
2数据要存在mysql数据库中
3数据要存在mongodb数据库中
4.文件的存储
'''
class CaipiaoPipeline:
    """Item pipeline that writes every item as one line of ``./双色球.csv``."""

    def open_spider(self, spider):
        """Create the CSV file and write its header row."""
        print('开始存储!')
        self.f = open('./双色球.csv', mode='w', encoding='utf-8')
        self.f.write("期数,红球号码,蓝球号码\n")

    def close_spider(self, spider):
        """Close the CSV file."""
        print('存储完毕!')
        if not self.f:
            return
        self.f.close()

    def process_item(self, item, spider):
        """Append the item as ``issue,red balls,blue ball`` and return it."""
        # Red balls are space-separated inside their column.
        red_numbers = ' '.join(item['red_ball'])
        row = "{},{},{}\n".format(item['qihao'], red_numbers, item['blue_ball'])
        self.f.write(row)
        return item
class mySQLPipeline:
    """Item pipeline that inserts every item into the MySQL table ``caipiao``."""

    def open_spider(self, spider):
        """Open the MySQL connection when the spider starts."""
        print('开始存储!')
        # NOTE(review): credentials are hard-coded here; consider moving
        # them to settings or environment variables.
        self.conn = pymysql.Connect(
            host="localhost",
            port=3306,
            user="root",
            password="dxs666dxs",
            database="spider"
        )

    def close_spider(self, spider):
        """Close the MySQL connection."""
        print('存储完毕!')
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        """Insert one item as a row and return the item.

        The item is returned so that any later pipeline still receives it.
        A failed insert is printed and rolled back; it does not stop the
        crawl.
        """
        sql = "insert into caipiao values(%s, %s, %s)"
        # red_ball arrives as a list of strings; store it as one
        # space-separated value, the same form CaipiaoPipeline writes.
        red_ball = item['red_ball']
        if not isinstance(red_ball, str):
            red_ball = ' '.join(red_ball)
        try:
            # The context manager closes the cursor once the statement ran.
            with self.conn.cursor() as cur:
                cur.execute(sql, (item['qihao'], red_ball, item['blue_ball']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

View File

@ -0,0 +1,89 @@
# Scrapy settings for caipiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project name and the package that holds (and receives newly generated) spiders
BOT_NAME = 'caipiao'
SPIDER_MODULES = ['caipiao.spiders']
NEWSPIDER_MODULE = 'caipiao.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'caipiao (+http://www.yourdomain.com)'
# robots.txt rules are NOT obeyed by this project
ROBOTSTXT_OBEY = False
# Only messages at WARNING level and above are logged
LOG_LEVEL = 'WARNING'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'caipiao.middlewares.CaipiaoSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# The project's single downloader middleware and its order value.
DOWNLOADER_MIDDLEWARES = {
    'caipiao.middlewares.CaipiaoDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Only the CSV pipeline is enabled.
ITEM_PIPELINES = {
    'caipiao.pipelines.CaipiaoPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,43 @@
import scrapy
from ..items import CaipiaoItem
from selenium import webdriver
from selenium.webdriver import ChromeOptions
class SeqSpider(scrapy.Spider):
    """Spider that parses the rows of the ``tdata`` table on the start page.

    It owns a headless Chrome instance (``self.bro``) for the lifetime of
    the crawl.
    """

    name = 'seq'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://datachart.500.com/ssq/']

    def __init__(self, **kwargs):
        """Start the headless Chrome instance with automation hints reduced."""
        super().__init__(**kwargs)
        option = ChromeOptions()
        # Both switches go in ONE call: a second call with the same option
        # name replaces the first, which would drop 'enable-automation'.
        option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
        option.add_argument("--no-sandbox")
        option.add_argument("--disable-dev-shm-usage")
        option.add_argument("--window-size=1920,1080")  # setting a window size is recommended
        option.add_argument('--headless')
        option.add_argument('--disable-gpu')
        # option.add_argument('blink-settings=imagesEnabled=false')
        # Raw string: the value is unchanged, but the backslashes are no
        # longer invalid escape sequences.
        self.bro = webdriver.Chrome(executable_path=r'D:\爬虫\selenium\chromedriver.exe', options=option)

    def closed(self, spider):
        """Quit the browser when the spider closes.

        NOTE(review): Scrapy presumably passes the close reason here rather
        than a spider object; the argument is unused either way - confirm.
        """
        self.bro.quit()

    def parse(self, response):
        """Yield one ``CaipiaoItem`` per data row of the ``tdata`` table."""
        for tr in response.xpath('//*[@id="tdata"]/tr'):
            # Skip rows that carry no data.
            if tr.xpath('./@class').extract_first() == 'tdbck':
                continue
            qishu = tr.xpath('./td[1]/text()').extract_first()
            if qishu is None:
                # No text in the first cell: nothing to record for this row.
                continue
            # XPath alternative: tr.xpath('./td[@class="chartBall01"]/text()').extract()
            red_ball = tr.css(".chartBall01::text").extract()
            blue_ball = tr.css(".chartBall02::text").extract_first()

            item = CaipiaoItem()
            item['qihao'] = qishu.strip()
            item['red_ball'] = red_ball
            item['blue_ball'] = blue_ball
            yield item

11
scrapy/caipiao/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = caipiao.settings
[deploy]
#url = http://localhost:6800/
project = caipiao

View File

View File

@ -0,0 +1,197 @@
import random
import requests
# Pool of browser User-Agent strings used for random UA rotation.
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated into a single (bogus) User-Agent.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
    "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
    "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
    "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
    "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
    "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
    "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
    "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
    "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
    "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
    "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
    "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
    "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
    "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
    "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
    "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
    "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
    "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
    "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
    "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
    "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
    "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
    "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
    "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
    "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
    "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
    "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
    "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
]
def get_ua():
    """Return one User-Agent string picked at random from ``USER_AGENTS``."""
    chosen = random.choice(USER_AGENTS)
    return chosen
def get_requests_headers():
    """Build a browser-like header dict with a randomly chosen User-Agent.

    Returns:
        dict: HTTP request headers suitable for ``requests.get(..., headers=...)``.
    """
    headers = {
        'User-Agent': get_ua(),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate, br',
        # 'Host': 'www.zhipin.com',
        # 'Origin': 'https://www.zhipin.com',
        'Referer': 'https://wz.sun0769.com/political/index/politicsNewest?id=1&page=1',
        'upgrade-insecure-requests': '1',
    }
    # The dict used to be built and then dropped, so callers always got None.
    return headers
if __name__ == '__main__':
    # Module self-check: show one generated header set, then fetch a sample
    # page with another one and dump the decoded body.
    print(get_requests_headers())
    check_url = 'http://www.ip3366.net/?stype=1&page=1'
    page = requests.get(check_url, headers=get_requests_headers())
    body = page.content.decode("gb2312", "ignore")
    print(body)

View File

@ -0,0 +1,13 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class ImgsproItem(scrapy.Item):
    """Item describing one scraped image: a file name plus a source URL."""

    # File name the image pipeline stores the picture under.
    img_name = scrapy.Field()
    # URL the image pipeline downloads the picture from.
    img_src = scrapy.Field()

View File

@ -0,0 +1,145 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
class ImgsproSpiderMiddleware:
    """Pass-through spider middleware (unmodified project template).

    Every hook is optional; a hook that is not defined is treated by Scrapy
    as "leave the passed objects untouched".
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Factory used by Scrapy; also subscribes to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Hook for each response on its way into the spider.

        Must return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Hook for what the spider produced for ``response``.

        Must return an iterable of Request or item objects; everything is
        forwarded unchanged.
        """
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by the spider or by another
        middleware's ``process_spider_input``.

        Should return None or an iterable of Request or item objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Hook for the spider's start requests.

        Like ``process_spider_output`` but without a response; must return
        requests only (no items).  Everything is forwarded unchanged.
        """
        yield from start_requests

    def spider_opened(self, spider):
        """Log that the spider has been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
class ImgsproDownloaderMiddleware:
    """Pass-through downloader middleware (unmodified project template).

    Every hook is optional; a hook that is not defined is treated by Scrapy
    as "leave the passed objects untouched".
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Factory used by Scrapy; also subscribes to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(
            middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Hook for every request passing through the downloader middleware.

        Allowed outcomes:
          - return None to continue processing the request,
          - return a Response object,
          - return a Request object,
          - raise IgnoreRequest (``process_exception`` of the installed
            downloader middlewares is then called).
        """
        return None

    def process_response(self, request, response, spider):
        """Hook for the response returned by the downloader.

        Must return a Response, return a Request, or raise IgnoreRequest;
        the response is handed back unchanged.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook for exceptions from a download handler or ``process_request``.

        Allowed outcomes:
          - return None to continue processing the exception,
          - return a Response object (stops the process_exception chain),
          - return a Request object (stops the process_exception chain).
        """
        return None

    def spider_opened(self, spider):
        """Log that the spider has been opened."""
        spider.logger.info('Spider opened: %s', spider.name)
# Middleware 1 - random User-Agent
from .fake_useragent import get_requests_headers, get_ua


class imgsProRandomuaDownloaderMiddleware(object):
    """Downloader middleware that gives every request a random User-Agent."""

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header with a random one.

        Only the UA *string* belongs in this header; the previous code
        assigned the whole return value of ``get_requests_headers()`` to it.
        Returns None so Scrapy keeps processing the request.
        """
        request.headers['User-Agent'] = get_ua()
        return None
# 中间件2 -随机代理
# import random
# from .proxies import proxy_list
#
# class BossjobRandomProxyDownloadMiddleware(object):
# def process_request(self, request, spider):
# proxy = random.choice(proxy_list)
# request.meta['proxy'] = proxy
# print(proxy)
#
# def process_exception(self, request, exception, spider):
# # 处理代理ip无法使用情况
# return request
# 中间件3 -Cookie
class imgsProCookieDownloaderMiddleware(object):
    """Downloader middleware that attaches a fixed cookie set to every request."""

    def process_request(self, request, spider):
        """Replace ``request.cookies`` with the parsed hard-coded cookies."""
        request.cookies = self.get_cookies()
        # print(cookie_dict)

    def get_cookies(self):
        """Parse the hard-coded ``Cookie`` header string into a dict.

        Returns:
            dict: cookie name -> cookie value.
        """
        # NOTE(review): a captured session cookie is hard-coded here; consider
        # moving it into settings or an environment variable.
        cookie_string = 'cz_statistics_visitor=6a89d058-1928-b3b0-23ec-dd69be6c601a; __bid_n=184bced47869fe68784207; FPTOKEN=aJKftmn/cRusAPgCcLDE2nPw1f6AOJ8O2QUSZDc3c8DvI5BXZ30JDOFLJMgL1IRmUrXBPceos2w32lBfN2EV9YGfaTCJRsiUCa0hhZE/W7lV1yrRpNcTOHVpdJ+2coFSRUj1ah8fG8R959GOo63vzd2UuGRfjD+wf8giIlSk1FhVeFN28vpeiCScpwb6K6NH3Lu28AA/1idjRk6PUvVjZuUkUVAOb3zgBUtIvIlFH3Fy6PxnN0MYEFUBlXfGw+S5GRRrffN44WeiC1NzodYwUs78bOaxu6NxOp6a0LkOgoaWjCiGlF2sFTQNoOVMQcf3QZ+EGXVyKbhi1+YEmY4YrMMcQTkDgZGWtUlwhzkBjOi3pf8rT3axAIefUN12FZ7/D3D0tW59zkrNXqNNVbwPsg==|pnNJ+7La9ur/GH7QYr2dOE2BpmC7rfTIjxxwS6VDPJA=|10|1e90646f2dfd14de2376168eeb9968f4'
        cookie_dict = {}
        for pair in cookie_string.split(';'):
            # strip(): pairs are separated by '; ', so names would otherwise
            # keep a leading space.  partition(): split on the FIRST '=' only,
            # because values (e.g. base64 padding) may contain '=' themselves.
            name, sep, value = pair.strip().partition('=')
            if not sep or not name:
                continue  # empty or malformed fragment
            cookie_dict[name] = value
        return cookie_dict

View File

@ -0,0 +1,31 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import scrapy
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
class ImgsproPipeline:
    """Debug pipeline: print every item and pass it on unchanged."""

    def process_item(self, item, spider):
        """Print ``item`` and return it for the next pipeline."""
        print(item)
        return item
class imgsPipeLine(ImagesPipeline):
    """Image pipeline that downloads ``item['img_src']`` and stores it under
    the name given in ``item['img_name']``."""

    def get_media_requests(self, item, info):
        """Issue the download request for the item's image URL."""
        image_url = item['img_src']
        yield scrapy.Request(image_url)

    def file_path(self, request, response=None, info=None, *, item):
        """Return the storage path of the image: the item's ``img_name``."""
        return item['img_name']

    def item_completed(self, results, item, info):
        """Hand the item on to the next pipeline class to be executed."""
        return item

View File

@ -0,0 +1,92 @@
# Scrapy settings for the imgsPro project.
#
# Only the important / commonly used settings are listed here.  The full
# reference lives in the Scrapy documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'imgsPro'

NEWSPIDER_MODULE = 'imgsPro.spiders'
SPIDER_MODULES = ['imgsPro.spiders']

# Identify yourself (and your website) on the user-agent to crawl responsibly.
# USER_AGENT = 'imgsPro (+http://www.yourdomain.com)'

# robots.txt rules are NOT obeyed.
ROBOTSTXT_OBEY = False

# Log level for the crawl.
LOG_LEVEL = 'ERROR'

# Maximum concurrent requests performed by Scrapy (default: 16).
CONCURRENT_REQUESTS = 32

# Delay for requests to the same website (default: 0).
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# and the autothrottle settings and docs.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Cookies are enabled by default; uncomment to disable them.
# COOKIES_ENABLED = False

# The Telnet console is enabled by default; uncomment to disable it.
# TELNETCONSOLE_ENABLED = False

# Default request headers override.
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Spider middlewares.
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'imgsPro.middlewares.ImgsproSpiderMiddleware': 543,
# }

# Downloader middlewares: random User-Agent and fixed cookies.
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'imgsPro.middlewares.imgsProRandomuaDownloaderMiddleware': 400,
    'imgsPro.middlewares.imgsProCookieDownloaderMiddleware': 500,
}

# Extensions.
# See https://docs.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Item pipelines.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'imgsPro.pipelines.ImgsproPipeline': 300,
    'imgsPro.pipelines.imgsPipeLine': 400,
}

# Directory the downloaded images are stored in.
IMAGES_STORE = './img_lib'

# AutoThrottle extension (disabled by default).
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# HTTP caching (disabled by default).
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,32 @@
import scrapy
from ..items import ImgsproItem
import re
class ImgSpider(scrapy.Spider):
    """Scrape image names and full-size image URLs from sc.chinaz.com.

    Starts at the listing front page and then follows the numbered listing
    pages ``index_2.html`` and ``index_3.html``.
    """

    name = 'img'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://sc.chinaz.com/tupian//']
    # Number of the next listing page to request.
    page_num = 2
    # Matches the "_s" thumbnail marker directly before the file extension.
    _thumb_marker = re.compile(r'_s(?=\.\w+(?:\?|$))')

    def parse(self, response):
        """Yield one ``ImgsproItem`` per image, then the next listing page."""
        for div in response.xpath('/html/body/div[3]/div[2]/div'):
            # The real URL sits in the lazy-loading "data-original" attribute.
            src = div.xpath('./img/@data-original').extract_first()
            if not src:
                # No image in this div: an item without a URL would only make
                # the image pipeline fail on an invalid request.
                continue
            item = ImgsproItem()
            item['img_name'] = ''.join(div.xpath('./img/@alt').extract()) + '.jpg'
            # Protocol-relative links ("//host/...") get an explicit https
            # scheme; anything else is resolved against the page URL.
            if src.startswith('//'):
                url = 'https:' + src
            else:
                url = response.urljoin(src)
            # A "_s" suffix marks the thumbnail; dropping it gives the
            # full-size original.  Only the suffix before the extension is
            # removed so other "_s" sequences in the URL stay intact.
            item['img_src'] = self._thumb_marker.sub('', url)
            yield item
        # Alternative pagination approach: request the numbered pages manually.
        if self.page_num <= 3:
            new_url = f'https://sc.chinaz.com/tupian/index_{self.page_num}.html'
            self.page_num += 1
            yield scrapy.Request(new_url, callback=self.parse)

11
scrapy/imgsPro/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = imgsPro.settings
[deploy]
#url = http://localhost:6800/
project = imgsPro

View File

View File

@ -0,0 +1,194 @@
import random
import requests
# Pool of browser User-Agent strings to pick from at random.
# Grouped by browser family: Chrome, Opera, Firefox, then Internet Explorer.
# NOTE(review): a few entries look malformed (e.g. unbalanced parentheses or two
# UAs in a single string); they are kept verbatim as found in the data set.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
    "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
    # The comma after the next entry was missing, which silently glued it to
    # the first Opera entry through implicit string concatenation.
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
    "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
    "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
    "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
    "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
    "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
    "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
    "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
    "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
    "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
    "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
    "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
    "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
    "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
    "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
    "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
    "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
    "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
    "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
    "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
    "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
    "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
    "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
    "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
    "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
    "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
    "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
]
def get_ua():
    """Return one User-Agent string picked at random from ``USER_AGENTS``."""
    user_agent = random.choice(USER_AGENTS)
    return user_agent
def get_requests_headers():
    """Build a browser-like header dict with a randomly chosen User-Agent.

    Returns:
        A new dict on every call holding ``User-Agent``, ``Accept``,
        ``Accept-Language``, ``Connection`` and ``Accept-Encoding``.
    """
    request_headers = {'User-Agent': random.choice(USER_AGENTS)}
    # The remaining headers are fixed; only the User-Agent varies per call.
    request_headers.update({
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate, br',
    })
    return request_headers
if __name__ == '__main__':
    # Module self-check: print a sample header set, then fetch a test page
    # with freshly generated headers and dump the decoded body.
    print(get_requests_headers())
    response = requests.get(
        'http://www.ip3366.net/?stype=1&page=1',
        headers=get_requests_headers(),
        # Without a timeout requests waits forever on an unresponsive server.
        timeout=10,
    )
    # The body is decoded as gb2312; undecodable bytes are dropped.
    print(response.content.decode("gb2312", "ignore"))

View File

@ -0,0 +1,12 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PaperItem(scrapy.Item):
    """Item container for the paper project; it declares no fields yet.

    Fields would be declared as class attributes, e.g.
    ``name = scrapy.Field()``.
    """

View File

@ -0,0 +1,41 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from .fake_useragent import get_ua
class PaperDownloaderMiddleware:
    """Downloader middleware that gives every request a random User-Agent."""

    def process_request(self, request, spider):
        """Overwrite the request's ``User-Agent`` header and return ``None``."""
        # UA spoofing: a fresh value from get_ua() on each outgoing request.
        request.headers['User-Agent'] = get_ua()
        return None

    def process_response(self, request, response, spider):
        """Hand the response back untouched."""
        return response

    def process_exception(self, request, exception, spider):
        """Do nothing with download exceptions; implicitly returns ``None``."""
        return None
class CookieDownloaderMiddleware(object):
    """Downloader middleware that sets the same cookies on every request.

    The cookies are parsed from ``cookie_string``, written in the form of a
    raw ``Cookie`` header value (``name=value; other=value``). Override the
    attribute on the class or an instance to supply real cookies.
    """

    # Raw cookie text; empty by default, which yields no cookies.
    cookie_string = ''

    def process_request(self, request, spider):
        """Replace ``request.cookies`` with the parsed cookie dict."""
        request.cookies = self.get_cookies()

    def get_cookies(self):
        """Parse ``cookie_string`` into a ``{name: value}`` dict.

        Fragments that are empty or have no ``=`` are skipped. The previous
        implementation raised ``IndexError`` on them, including on the
        default empty string. Only the first ``=`` separates name from value,
        so values that themselves contain ``=`` are kept whole.

        Returns:
            dict mapping cookie names to values, with surrounding whitespace
            removed from both.
        """
        cookie_dict = {}
        for fragment in self.cookie_string.split(';'):
            name, separator, value = fragment.partition('=')
            name = name.strip()
            if not separator or not name:
                continue
            cookie_dict[name] = value.strip()
        return cookie_dict

View File

@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class PaperPipeline:
    """Pass-through item pipeline: items leave exactly as they arrived."""

    def process_item(self, item, spider):
        """Return ``item`` unchanged; ``spider`` is not used."""
        unchanged = item
        return unchanged

View File

@ -0,0 +1,90 @@
# Scrapy settings for paper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project name and the package that holds (and receives newly generated) spiders
BOT_NAME = 'paper'
SPIDER_MODULES = ['paper.spiders']
NEWSPIDER_MODULE = 'paper.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'paper (+http://www.yourdomain.com)'
# robots.txt rules are NOT obeyed in this project (set to True to obey them)
ROBOTSTXT_OBEY = False
# Only log messages at WARNING level or above
LOG_LEVEL = 'WARNING'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'paper.middlewares.PaperSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Only the User-Agent middleware is registered (order value 543).
# NOTE(review): CookieDownloaderMiddleware is defined in paper.middlewares but
# is not listed here, so it appears to be unused; confirm that is intended.
DOWNLOADER_MIDDLEWARES = {
'paper.middlewares.PaperDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Registers PaperPipeline with order value 300.
ITEM_PIPELINES = {
'paper.pipelines.PaperPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,28 @@
import scrapy
class PageSpider(scrapy.Spider):
    """Post a login form to 17k.com, then fetch and print the shelf JSON.

    NOTE(review): the follow-up request presumably relies on Scrapy's cookie
    handling to carry the login session; confirm against the settings.
    """

    name = 'page'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919']

    def start_requests(self):
        """Start the crawl with the login request instead of ``start_urls``."""
        # Credentials are left blank here and must be filled in before use.
        credentials = {
            'loginName': '',
            'password': '',
        }
        # One way to send a POST: FormRequest with the form fields.
        login_request = scrapy.FormRequest(
            url='https://passport.17k.com/ck/user/login',
            formdata=credentials,
            callback=self.parse,
        )
        yield login_request

    def parse(self, response, **kwargs):
        """After the login response arrives, request the shelf URL."""
        shelf_url = self.start_urls[0]
        yield scrapy.Request(url=shelf_url, callback=self.detail_parse)

    def detail_parse(self, response):
        """Print the shelf response decoded as JSON."""
        payload = response.json()
        print(payload)

11
scrapy/paper/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = paper.settings
[deploy]
#url = http://localhost:6800/
project = paper

11
scrapy/sunPro/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = sunPro.settings
[deploy]
#url = http://localhost:6800/
project = sunPro

View File

View File

@ -0,0 +1,196 @@
import random
import requests
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
"Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
"Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
"Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
"Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14"
"Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
"Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
"Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
"Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
"Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
"Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
"Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
"Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
"Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
"Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
"Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
"Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
"Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
"Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
"Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
"Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
"Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
"Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
"Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
"Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
"Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
"Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
"Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
"Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
"Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
"Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
"Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
"Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
"Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
"Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
"Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
"Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
"Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
"Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
"Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
"Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
"Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
"Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
"Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
"Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
"Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
"Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
"Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
"Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
"Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
"Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
"Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
"Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
"Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
"Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
"Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
"Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
"Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
"Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
"Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
"Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
"Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
"Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
]
def get_ua():
    """Pick one User-Agent string at random from ``USER_AGENTS``."""
    picked = random.choice(USER_AGENTS)
    return picked
def get_requests_headers():
    """Build a browser-like request header dict with a random User-Agent.

    Returns:
        dict: header names mapped to values; ``'User-Agent'`` is drawn at
        random from ``USER_AGENTS`` on every call.
    """
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Accept-Encoding': 'gzip, deflate, br',
        # Optional site-specific headers, currently disabled:
        # 'Host': 'www.zhipin.com',
        # 'Origin': 'https://www.zhipin.com',
        # 'Referer': 'https://www.zhipin.com/',
    }
    # The dict used to be built and then dropped, so every caller got None.
    return headers
if __name__ == '__main__':
    # Module self-check: print one generated header set, then fetch a page with another.
    print(get_requests_headers())
    # requests waits indefinitely unless a timeout is given; bound the wait so
    # the self-check cannot hang on an unresponsive server.
    response = requests.get(
        'http://www.ip3366.net/?stype=1&page=1',
        headers=get_requests_headers(),
        timeout=10,
    )
    # The body is decoded as GB2312; undecodable bytes are dropped.
    print(response.content.decode("gb2312", "ignore"))

View File

@ -0,0 +1,23 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class SunproItem(scrapy.Item):
    """Scrapy item describing one scraped record with six fields."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    number = scrapy.Field()   # record number
    title = scrapy.Field()    # record title
    status = scrapy.Field()   # status text
    content = scrapy.Field()  # body text
    city = scrapy.Field()     # city / origin text
    time = scrapy.Field()     # time text (stored as scraped)
# class DetailItem(scrapy.Item):
# # define the fields for your item here like:
# # name = scrapy.Field()
# id = scrapy.Field()
# content = scrapy.Field()

View File

@ -0,0 +1,114 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
from time import sleep
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from scrapy.http import HtmlResponse
class SunproSpiderMiddleware:
    """Pass-through spider middleware.

    Every hook forwards what it receives unchanged; the only side effect is
    a log line when a spider opens. Hooks that are not defined are treated
    by Scrapy as "does not modify the passed objects".
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware and subscribe it to the ``spider_opened`` signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Let every response through to the spider; always returns ``None``."""
        return None

    def process_spider_output(self, response, result, spider):
        """Yield each request/item produced by the spider, unchanged and in order."""
        yield from result

    def process_spider_exception(self, response, exception, spider):
        """Take no action on spider exceptions; returns ``None``."""
        return None

    def process_start_requests(self, start_requests, spider):
        """Yield the spider's start requests unchanged and in order."""
        yield from start_requests

    def spider_opened(self, spider):
        """Log the name of the spider that has just been opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
class SunproDownloaderMiddleware:
    """Downloader middleware that swaps every response for a Selenium-rendered one."""

    def process_response(self, request, response, spider):
        """Reload the request URL in the spider's browser and return that page.

        The downloaded ``response`` is discarded. The URL is opened in
        ``spider.bro`` so dynamically loaded content is present, and the
        browser's page source is wrapped in a new ``HtmlResponse``.
        """
        browser = spider.bro
        target_url = request.url
        browser.get(target_url)
        # Short fixed pause before the page source is read.
        sleep(0.1)
        rendered_html = browser.page_source
        return HtmlResponse(
            url=target_url,
            body=rendered_html,
            encoding='utf-8',
            request=request,
        )
# 中间件1 -随机UA
from .fake_useragent import get_requests_headers
class RandomuaDownloaderMiddleware(object):
    """Downloader middleware that puts a random User-Agent on each outgoing request."""

    def process_request(self, request, spider):
        """Set the request's ``User-Agent`` header from ``get_requests_headers()``.

        Returns ``None`` (implicitly) so Scrapy keeps processing the request.
        """
        headers = get_requests_headers()
        # get_requests_headers() describes a whole header set; only its
        # 'User-Agent' entry belongs in the User-Agent header. The original
        # code assigned the entire return value to that single header.
        if isinstance(headers, dict):
            user_agent = headers.get('User-Agent')
        else:
            user_agent = headers
        # Leave the request untouched when no usable value came back, rather
        # than overwriting the header with an empty one.
        if user_agent:
            request.headers['User-Agent'] = user_agent
# 中间件2 -随机代理
# import random
# from .proxies import proxy_list
#
# class RandomProxyDownloadMiddleware(object):
# def process_request(self, request, spider):
# proxy = random.choice(proxy_list)
# request.meta['proxy'] = proxy
# print(proxy)
#
# def process_exception(self, request, exception, spider):
# # 处理代理ip无法使用情况
# return request
# 中间件3 -Cookie
class CookieDownloaderMiddleware(object):
    """Downloader middleware that attaches a fixed set of cookies to every request."""

    def process_request(self, request, spider):
        """Replace ``request.cookies`` with the parsed hard-coded cookies."""
        request.cookies = self.get_cookies()

    def get_cookies(self):
        """Parse the hard-coded cookie header string into a ``{name: value}`` dict.

        Returns:
            dict: cookie names mapped to their values, with no surrounding
            whitespace in either.
        """
        cookie_string = 'tgw_l7_route=581a2b818047111abece09009aea53ba; PHPSESSID=6sq7bpo9m0vsntmr1mq7othflj; Hm_lvt_8634401b25f1b0008d9638ccfc17752d=1673232337; Hm_lvt_3ac08b9ee936f8dd8b720065d8af23d0=1673232337; Hm_lpvt_3ac08b9ee936f8dd8b720065d8af23d0=1673233037; Hm_lpvt_8634401b25f1b0008d9638ccfc17752d=1673233037'
        cookie_dict = {}
        for pair in cookie_string.split(';'):
            # Pairs are separated by "; ", so strip the leading space that
            # would otherwise become part of the cookie name. partition()
            # splits on the first "=" only, keeping any "=" inside the value.
            name, separator, value = pair.strip().partition('=')
            if not separator or not name:
                # Skip empty or malformed fragments instead of raising.
                continue
            cookie_dict[name] = value
        return cookie_dict

View File

@ -0,0 +1,48 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
import pymysql
from itemadapter import ItemAdapter
# class SunproPipeline:
# def process_item(self, item, spider):
# # 如何判断item的类型
# # 将数据写入数据库中,如何保证数据的一致性
# if item.__class__.__name__ == 'DetailItem':
# print(item['id'], item['content'])
# else:
# print(item['number'], item['title'])
# return item
class mysqlPipeLine(object):
    """Item pipeline that stores each scraped item as one row of the MySQL table ``new``."""

    # Database connection and cursor; both stay None until they are opened.
    conn = None
    cursor = None

    # Values are bound by the driver instead of being interpolated into the
    # SQL text, so quotes inside scraped text can neither break nor inject
    # into the statement.
    _INSERT_SQL = 'insert into new values(%s, %s, %s, %s, %s, %s)'

    def open_spider(self, spider):
        """Open the database connection and a reusable cursor when the spider starts."""
        # NOTE(review): credentials are hard-coded here; consider loading them
        # from settings or the environment.
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='dxs666dxs', db='Bossjob', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item and commit; on any failure print the error and roll back.

        Returns:
            The same ``item``, so later pipelines still receive it.
        """
        if self.cursor is None:
            # One cursor is reused for the whole crawl instead of creating
            # (and never closing) a new one for every item.
            self.cursor = self.conn.cursor()
        try:
            row = (item['number'], item['title'], item['content'],
                   item['status'], item['city'], item['time'])
            self.cursor.execute(self._INSERT_SQL, row)
            self.conn.commit()
            print('成功插入编号为', item['number'], '的数据!')
        except Exception as e:
            # Best effort by design: a failed insert must not stop the crawl.
            print(e)
            print('error!')
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        """Close the cursor and the connection, whichever of them were opened."""
        if self.cursor is not None:
            self.cursor.close()
        if self.conn is not None:
            self.conn.close()

View File

@ -0,0 +1,91 @@
# Scrapy settings for sunPro project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project identity: the bot name, the package(s) searched for spiders, and the
# package that newly generated spiders are placed in.
BOT_NAME = 'sunPro'
SPIDER_MODULES = ['sunPro.spiders']
NEWSPIDER_MODULE = 'sunPro.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'sunPro (+http://www.yourdomain.com)'
# Obey robots.txt rules
# robots.txt is not honoured by this project.
ROBOTSTXT_OBEY = False
# Only log messages of level ERROR and above.
LOG_LEVEL = 'ERROR'
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The delay value is expressed in seconds.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'sunPro.middlewares.SunproSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Enabled downloader middlewares and their order values.
# Scrapy calls process_request() hooks in increasing order and
# process_response() hooks in decreasing order. Here the Cookie (400) and
# random-UA (543) middlewares only define process_request(), and
# SunproDownloaderMiddleware (300) only defines process_response().
DOWNLOADER_MIDDLEWARES = {
'sunPro.middlewares.RandomuaDownloaderMiddleware': 543,
'sunPro.middlewares.CookieDownloaderMiddleware': 400,
'sunPro.middlewares.SunproDownloaderMiddleware': 300,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enabled item pipelines and their order values; the MySQL pipeline is the
# only one registered.
ITEM_PIPELINES = {
'sunPro.pipelines.mysqlPipeLine': 200,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,73 @@
import re
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from ..items import SunproItem
class SunSpider(CrawlSpider):
    """Crawl the newest-complaints listing on wz.sun0769.com and each complaint's detail page.

    A headless Chrome instance is created per spider and exposed as
    ``self.bro``; it is shut down again in ``closed()``.
    """
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://wz.sun0769.com/political/index/politicsNewest']

    # Default chromedriver location. Scrapy copies spider arguments onto the
    # instance, so ``-a chromedriver_path=...`` overrides it without a code
    # change. The raw string yields the same path as before but without
    # invalid escape sequences such as ``\s`` and ``\c``.
    chromedriver_path = r'D:\爬虫\selenium\chromedriver.exe'

    def __init__(self, **kwargs):
        """Create the spider and start the headless browser it drives."""
        super().__init__(**kwargs)
        option = ChromeOptions()
        # add_experimental_option() stores one value per option name, so a
        # second call with 'excludeSwitches' replaces the first. Both switches
        # therefore go into a single list: 'enable-automation' reduces the
        # chance of Selenium being detected, 'enable-logging' silences logs.
        option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
        option.add_argument("--no-sandbox")
        option.add_argument("--disable-dev-shm-usage")
        option.add_argument("--window-size=1920,1080")  # an explicit window size is recommended
        option.add_argument('--headless')
        option.add_argument('--disable-gpu')
        option.add_argument('blink-settings=imagesEnabled=false')
        # NOTE(review): Selenium 4 deprecates ``executable_path`` in favour of
        # a Service object -- confirm against the installed Selenium version.
        self.bro = webdriver.Chrome(executable_path=self.chromedriver_path, options=option)

    def closed(self, spider):
        """Quit the browser when the spider closes.

        NOTE(review): Scrapy passes the close *reason* to this hook; the
        parameter keeps its original name for compatibility.
        """
        self.bro.quit()

    # Link extractor: collects the pagination links matching ``allow`` inside the pager element.
    link = LinkExtractor(allow=r'id=1&page=\d', restrict_xpaths='/html/body/div[2]/div[3]/div[3]/div/a')
    # link_detail = LinkExtractor(restrict_xpaths='/html/body/div[2]/div[3]/ul[2]/li/span[3]/a')
    rules = (
        # Rule: parse every extracted listing page with ``parse_item``;
        # follow=True applies the extractor again to the pages it found.
        Rule(link, callback='parse_item', follow=True),
        # Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        """Read number, status and title from each listing row and request its detail page.

        Yields:
            scrapy.Request: one per row, carrying the partly filled item in ``meta['item']``.
        """
        for li in response.xpath('/html/body/div[2]/div[3]/ul[2]/li'):
            href = li.xpath('./span[3]/a/@href').extract_first()
            if not href:
                # Without a detail link there is nothing to follow; skip the
                # row instead of failing on ``str + None``.
                continue
            item = SunproItem()
            item['number'] = li.xpath('./span[1]/text()').extract_first()
            # extract_first() returns None when the node is missing.
            item['status'] = (li.xpath('./span[2]/text()').extract_first() or '').strip()
            item['title'] = li.xpath('./span[3]/a/text()').extract_first()
            detail_url = 'https://wz.sun0769.com' + href
            yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})

    def parse_detail(self, response):
        """Complete the item with content, city and time from the detail page.

        Yields:
            SunproItem: the item started in ``parse_item``, now fully populated.
        """
        item = response.meta['item']
        content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre//text()').extract()
        item['content'] = ''.join(content)
        # A missing node yields None; fall back to '' so the clean-up below cannot raise.
        city = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[2]/text()').extract_first() or ''
        # Drop the "from:" label and then any remaining spaces.
        city = re.sub(' 来自:', '', city)
        item['city'] = re.sub(' ', '', city)
        item['time'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[3]/text()').extract_first()
        yield item

12
scrapy/wangyi/news.txt Normal file
View File

@ -0,0 +1,12 @@
(1)加拿大将为乌克兰购买美制防空系统 俄方:荒谬:
来源:环球网【环球网报道 见习记者 李律杉】据路透社报道美加两国元首在墨西哥城会晤后加拿大总理特鲁多办公室周二10日发表声明称加拿大将为乌克兰购买美国制造的“国家先进地对空导弹系统”(NASAMS)。报道披露当天特鲁多和拜登正在墨西哥参加第十届北美领导人峰会两人在支持乌克兰方面进行了单独会晤。在此期间特鲁多告诉拜登加拿大将为乌克兰购买美制地空导弹系统一事。“这是加拿大首次向乌克兰捐赠防空系统。”加拿大国防部长安妮塔·阿南德在推特上写道。她还表示乌克兰防长列兹尼科夫10日早些时候在电话中告诉她得到防空系统是乌克兰的首要任务。阿南德介绍称NASAMS是一种中短程地面防空系统可抵御无人机、导弹和飞机的攻击。对于加拿大这一援乌决定俄罗斯驻加拿大大使奥列格·斯捷潘诺夫作出回应。据俄罗斯卫星通讯社报道斯捷潘诺夫在得知此事表示“特鲁多总理的内阁把钱花在进一步激化战争上支持一个距离加拿大上千公里之外的非法政权这看起来很荒谬。”“尤其荒谬的是这是在加拿大目前国内还面临着各种问题的背景下做出的决定。”另外根据加拿大总理办公室的声明特鲁多和拜登还就加拿大皇家空军采购F-35战斗机一事展开讨论。据央视新闻报道加拿大国防部长安妮塔·阿南德当地时间1月9日宣布加拿大已经签署了购买F-35战机的最终合同初期购买金额达190亿加元。据悉这88架战机中的第一架将在2026年之前交付而第一批F-35中队将在2029年之前投入使用。
(35)台媒57架次解放军军机进入台岛周边 "异常紧张":
来源:环球网【环球网报道】“解放军对台打击军演 57架次共机三面围台 我战机与地面飞弹紧盯”中国人民解放军东部战区1月8日位台岛周边海空域组织诸军兵种联合战备警巡和实战化演练第二天台湾中时新闻网以此为题渲染“气氛异常紧张”。台防务部门9日的说法宣称自8日上午6时至9日上午6时止“侦获”解放军军机57架次其中28架次逾越“台海中线”、军舰4艘次持续在台湾海峡周边活动。8日夜东部战区新闻发言人施毅陆军大校表示当天中国人民解放军东部战区位台岛周边海空域组织诸军兵种联合战备警巡和实战化演练重点演练对陆打击、对海突击等内容旨在检验部队联合作战能力坚决反击外部势力、“台独”分裂势力勾连挑衅行径。中时新闻网9日称解放军军机“扰台”范围明显扩大且集中在8日夜间台空军战机整夜不断紧急升空地面导弹部队更是进入高度警戒。台军还声称运用任务机、舰艇及岸基导弹系统“严密监控”与“应处”。中时新闻网还称台各空军基地8日晚气氛异常紧张从北到南甚至东部各基地战机接连紧急起飞架次比平常多状况如去年大陆军演一般不少住在基地周边的民众都感觉到一丝不寻常的气氛直到解放军东部战区发文才知道原因是大陆进行演练。此次演习距东部战区位台岛周边海空域演习还不到半个月2022年12月25日中国人民解放军东部战区位台岛周边海空域组织诸军兵种联合战备警巡和联合火力打击演练。这是针对当前美台升级勾连挑衅的坚决回应。此前的12月23日美国总统拜登签署“2023财年国防授权法案”其中一项内容是未来5年将对台提供总额100亿美元、每年最多20亿美元的“军事援助”。该法案还要求“加速处理台湾军购请求”并建议邀请台湾参与2024年“环太平洋军演”。这些严重违反一个中国原则和中美三个联合公报规定的恶性条款给台海和平稳定造成严重损害。
(34)德媒柏林正疯狂寻找向基辅承诺的40辆步兵战车:
来源中国新闻网中新网1月9日电 据德国《明镜》周刊报道德国正在“疯狂地”寻找给乌克兰承诺的40辆“黄鼠狼”步兵战车柏林将不得不从自己的武装力量储备中取出所承诺战车的大部分。报道称德国总理朔尔茨此前曾向基辅承诺了40辆“黄鼠狼”步兵战车目前联邦政府正在疯狂地寻找承诺的步兵战车。“德国政府尚未准备好供应此类军备这就是为什么德国国防军必须清空其仓库但它储备状态其实已经很差了。”德国联邦议院议员亨宁·奥特说道。报道指出当政府决定将“黄鼠狼”步兵战车交付给乌克兰德国军方、政界人士和安全专家都开始怀疑柏林将从哪里获得承诺的设备。朔尔茨的话“没那么容易实现”。消息显示德国国防企业莱茵金属(Rheinmetall)公司库存有近60辆有缺陷的“黄鼠狼”步兵战车但将其升级会需要很长时间。据报道德国总理朔尔茨与美国总统拜登5日通电话就向基辅运送重型军事装备达成一致。随后德国宣布拟向乌克兰供应40辆“黄鼠狼”步兵战车和1枚“爱国者”防空导弹。乌克兰局势升级以来德国已向乌克兰提供价值22.5亿欧元的武器和军事装备。

11
scrapy/wangyi/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = wangyi.settings
[deploy]
#url = http://localhost:6800/
project = wangyi

View File

View File

@ -0,0 +1,194 @@
import random
import requests
# Pool of browser User-Agent strings to pick from at random.
# Every entry must end with a comma: adjacent string literals are silently
# concatenated, which previously merged the last Chrome entry with the first
# Opera entry into one invalid User-Agent.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
    "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
    "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
    "Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16",
    "Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14",
    "Mozilla/5.0 (Windows NT 6.0; rv:2.0) Gecko/20100101 Firefox/4.0 Opera 12.14",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0) Opera 12.14",
    "Opera/12.80 (Windows NT 5.1; U; en) Presto/2.10.289 Version/12.02",
    "Opera/9.80 (Windows NT 6.1; U; es-ES) Presto/2.9.181 Version/12.00",
    "Opera/9.80 (Windows NT 5.1; U; zh-sg) Presto/2.9.181 Version/12.00",
    "Opera/12.0(Windows NT 5.2;U;en)Presto/22.9.168 Version/12.00",
    "Opera/12.0(Windows NT 5.1;U;en)Presto/22.9.168 Version/12.00",
    "Mozilla/5.0 (Windows NT 5.1) Gecko/20100101 Firefox/14.0 Opera/12.0",
    "Opera/9.80 (Windows NT 6.1; WOW64; U; pt) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.10.229 Version/11.62",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; de) Presto/2.9.168 Version/11.52",
    "Opera/9.80 (Windows NT 5.1; U; en) Presto/2.9.168 Version/11.51",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; de) Opera 11.51",
    "Opera/9.80 (X11; Linux x86_64; U; fr) Presto/2.9.168 Version/11.50",
    "Opera/9.80 (X11; Linux i686; U; hu) Presto/2.9.168 Version/11.50",
    "Opera/9.80 (X11; Linux i686; U; ru) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (X11; Linux i686; U; es-ES) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/5.0 Opera 11.11",
    "Opera/9.80 (X11; Linux x86_64; U; bg) Presto/2.8.131 Version/11.10",
    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.8.99 Version/11.10",
    "Opera/9.80 (Windows NT 5.1; U; zh-tw) Presto/2.8.131 Version/11.10",
    "Opera/9.80 (Windows NT 6.1; Opera Tablet/15165; U; en) Presto/2.8.149 Version/11.1",
    "Opera/9.80 (X11; Linux x86_64; U; Ubuntu/10.10 (maverick); pl) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (X11; Linux i686; U; ja) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (X11; Linux i686; U; fr) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; sv) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; en-US) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.1; U; cs) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 6.0; U; pl) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.2; U; ru) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.1; U;) Presto/2.7.62 Version/11.01",
    "Opera/9.80 (Windows NT 5.1; U; cs) Presto/2.7.62 Version/11.01",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.13) Gecko/20101213 Opera/9.80 (Windows NT 6.1; U; zh-tw) Presto/2.7.62 Version/11.01",
    "Mozilla/5.0 (Windows NT 6.1; U; nl; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
    "Mozilla/5.0 (Windows NT 6.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.01",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; de) Opera 11.01",
    "Opera/9.80 (X11; Linux x86_64; U; pl) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (X11; Linux i686; U; it) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.6.37 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; pl) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; ko) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; fi) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1; U; en-GB) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.1 x64; U; en) Presto/2.7.62 Version/11.00",
    "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
    "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0",
    "Mozilla/5.0 (X11; Linux i586; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:31.0) Gecko/20130401 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:31.0) Gecko/20100101 Firefox/31.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20120101 Firefox/29.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/29.0",
    "Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:27.3) Gecko/20130101 Firefox/27.3",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:27.0) Gecko/20121011 Firefox/27.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:25.0) Gecko/20100101 Firefox/25.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:24.0) Gecko/20100101 Firefox/24.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:23.0) Gecko/20131011 Firefox/23.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:22.0) Gecko/20130328 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Microsoft Windows NT 6.2.9200.0); rv:22.0) Gecko/20130405 Firefox/22.0",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:21.0.0) Gecko/20121011 Firefox/21.0.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (X11; Linux i686; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:21.0) Gecko/20130514 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; rv:21.0) Gecko/20130326 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20130330 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130328 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130401 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20130331 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.1; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 5.0; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:21.0) Gecko/20100101 Firefox/21.0",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64;) Gecko/20100101 Firefox/20.0",
    "Mozilla/5.0 (Windows x86; rv:19.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:6.0) Gecko/20100101 Firefox/19.0",
    "Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/18.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
    "Mozilla/5.0 (compatible; MSIE 10.6; Windows NT 6.1; Trident/5.0; InfoPath.2; SLCC1; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET CLR 2.0.50727) 3gpp-gba UNTRUSTED/1.0",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)",
    "Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)",
    "Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)",
    "Mozilla/4.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/5.0)",
    "Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; WIndows NT 9.0; en-US))",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 7.1; Trident/5.0)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; Media Center PC 6.0; InfoPath.3; MS-RTC LM 8; Zune 4.7",
]
def get_ua():
    """Return one User-Agent string picked at random from USER_AGENTS."""
    picked = random.choice(USER_AGENTS)
    return picked
def get_requests_headers():
    """Build a fresh request-header dict with a randomly chosen User-Agent.

    Returns:
        dict: 'User-Agent' (random entry of USER_AGENTS) followed by fixed
        'Accept', 'Accept-Language', 'Connection' and 'Accept-Encoding'
        values. A new dict is created on every call.
    """
    user_agent = random.choice(USER_AGENTS)
    # Headers that never vary between calls.
    fixed = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
        'Connection': 'close',
        'Accept-Encoding': 'gzip, deflate, br',
    }
    # User-Agent goes first so the key order matches the original layout.
    return {'User-Agent': user_agent, **fixed}
if __name__ == '__main__':
    # Module self-check: print one generated header set, then issue a real
    # request with another generated header set and dump the decoded page.
    print(get_requests_headers())
    # Without a timeout requests waits indefinitely on an unresponsive
    # server, which would make this self-check hang.
    response = requests.get(
        'http://www.ip3366.net/?stype=1&page=1',
        headers=get_requests_headers(),
        timeout=10,
    )
    # The body is decoded as gb2312; undecodable bytes are dropped.
    print(response.content.decode("gb2312", "ignore"))

View File

@ -0,0 +1,15 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class WangyiItem(scrapy.Item):
    """Scrapy item carrying one news entry through the pipeline.

    Fields:
        title: headline text of the entry.
        content: body text of the entry.
        number: sequence number attached to the entry.
    """

    title = scrapy.Field()
    content = scrapy.Field()
    number = scrapy.Field()

View File

@ -0,0 +1,52 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
from scrapy import signals
# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter
from .fake_useragent import USER_AGENTS
from scrapy.http import HtmlResponse
from time import sleep
class WangyiDownloaderMiddleware:
    """Downloader middleware: random User-Agent plus Selenium-rendered section pages.

    Requests get a random User-Agent from USER_AGENTS. Responses whose request
    URL is listed in ``spider.models_url`` are replaced by the HTML that the
    spider's Selenium browser (``spider.bro``) renders for the same URL; every
    other response passes through untouched.
    """

    def process_request(self, request, spider):
        """Disguise the request with a randomly chosen User-Agent header."""
        chosen_agent = random.choice(USER_AGENTS)
        request.headers['User-Agent'] = chosen_agent
        return None

    def process_response(self, request, response, spider):
        """Swap in browser-rendered HTML for section pages.

        Returns the original ``response`` unless ``request.url`` is one of
        ``spider.models_url``; in that case a new ``HtmlResponse`` built from
        the browser's page source is returned instead.
        """
        browser = spider.bro
        if request.url not in spider.models_url:
            # Not a section page: hand the downloaded response back unchanged.
            return response

        # Section pages load their news list dynamically, so the page is
        # fetched again through Selenium and scrolled before reading the DOM.
        browser.get(request.url)
        sleep(0.5)
        browser.execute_script('window.scrollTo(0,10000)')
        rendered_html = browser.page_source
        return HtmlResponse(
            url=request.url,
            body=rendered_html,
            encoding='utf-8',
            request=request,
        )

    def process_exception(self, request, exception, spider):
        """No custom exception handling; returns None."""
        return None

View File

@ -0,0 +1,35 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class WangyiPipeline(object):
    """Item pipeline that writes every scraped news item to ``./news.txt``.

    The file is opened once when the spider starts, receives one record per
    item, and is closed when the spider finishes.
    """

    # Output file handle; set in open_spider, released in close_spider.
    fp = None

    def open_spider(self, spider):
        """Open the output file; called once when the crawl starts."""
        print('开始爬虫!')
        self.fp = open('./news.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Persist one item and pass it on to the next pipeline.

        Args:
            item: mapping with 'title', 'content' and 'number' keys.
            spider: the running spider (unused).

        Returns:
            The same ``item``, so later pipelines can process it.
        """
        # The spider fills title via extract_first(), which yields None when
        # the XPath matches nothing; fall back to '' instead of crashing on
        # str + None.
        title = item['title'] or ''
        content = item['content'] or ''
        number = item['number']
        print('正在下载第', number, '个新闻。。。')
        # Record layout: "(<number>)<title>:" on one line, content on the next.
        self.fp.write(f'({number}){title}:\n{content}\n')
        return item

    def close_spider(self, spider):
        """Close the output file; called once when the crawl ends."""
        print('结束爬虫!')
        # fp is still None if open_spider never ran or failed to open the file.
        if self.fp is not None:
            self.fp.close()
            self.fp = None

View File

@ -0,0 +1,89 @@
# Scrapy settings for wangyi project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project name for this Scrapy bot.
BOT_NAME = 'wangyi'
# Package that holds the project's spiders, and the package used for newly
# generated spiders.
SPIDER_MODULES = ['wangyi.spiders']
NEWSPIDER_MODULE = 'wangyi.spiders'
# Restrict logging to ERROR-level messages.
LOG_LEVEL = 'ERROR'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wangyi (+http://www.yourdomain.com)'
# Obey robots.txt rules
# NOTE: disabled for this project (False).
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
# Raised to 32 here.
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# Set to 3 here instead of the default 0.
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wangyi.middlewares.WangyiSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Enables the project's WangyiDownloaderMiddleware with order value 543.
DOWNLOADER_MIDDLEWARES = {
'wangyi.middlewares.WangyiDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enables the project's WangyiPipeline with order value 300.
ITEM_PIPELINES = {
'wangyi.pipelines.WangyiPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

View File

@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

View File

@ -0,0 +1,68 @@
import scrapy
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from ..items import WangyiItem
class NewsSpider(scrapy.Spider):
    """Spider for https://news.163.com/ that collects articles from selected sections.

    Flow: ``parse`` reads section links from the home page, ``detail_parse``
    reads the article list of each section page, and ``content_parse`` reads
    the article body and yields the finished item. A headless Chrome instance
    is kept on ``self.bro`` for code outside this class that needs a browser.
    """

    name = 'news'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://news.163.com/']
    models_url = []  # URLs of the section pages collected by parse()
    number = 1  # running sequence number assigned to each item

    def __init__(self, **kwargs):
        """Create the spider and start the headless Chrome browser."""
        super().__init__(**kwargs)
        option = ChromeOptions()
        # add_experimental_option stores one value per option name, so both
        # switches must be passed in a single call. Two separate
        # 'excludeSwitches' calls would leave only the last list in effect and
        # silently drop 'enable-automation' (the anti-detection switch).
        option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
        option.add_experimental_option('useAutomationExtension', False)
        option.add_argument("--no-sandbox")
        option.add_argument("--disable-dev-shm-usage")
        option.add_argument("--window-size=1920,1080")  # explicit window size is recommended
        option.add_argument('--headless')
        option.add_argument('--disable-gpu')
        # Raw string: the Windows path contains backslashes that are not valid
        # escape sequences. The resulting value is unchanged.
        # NOTE(review): executable_path is not accepted by newer Selenium 4
        # releases - confirm against the Selenium version this project pins.
        self.bro = webdriver.Chrome(executable_path=r'D:\爬虫\selenium\chromedriver.exe', options=option)

    def closed(self, spider):
        """Shut the browser down when the spider closes.

        NOTE(review): Scrapy invokes this hook with the close reason; the
        parameter name is kept as-is for compatibility.
        """
        self.bro.quit()

    def detail_parse(self, response):
        """Parse one section page and request every listed article.

        Yields a ``scrapy.Request`` per article, carrying a partly filled
        ``WangyiItem`` (title and number) in ``meta['item']``.
        """
        div_list = response.xpath('//div[@class="ndi_main"]/div[@class="data_row news_article clearfix news_first"] | //div[@class="ndi_main"]/div[@class="data_row news_article clearfix "]')
        for div in div_list:
            content_url = div.xpath('./div/div/h3/a/@href').extract_first()
            if not content_url:
                # No link in this row: scrapy.Request would reject a missing
                # URL, so skip it before a sequence number is consumed.
                continue
            item = WangyiItem()
            item['title'] = div.xpath('./div/div/h3/a/text()').extract_first()
            item['number'] = self.number
            self.number += 1
            yield scrapy.Request(url=content_url, callback=self.content_parse, meta={'item': item})

    def content_parse(self, response):
        """Parse an article page, store its joined text in the item and yield it."""
        item = response.meta['item']
        content = response.xpath('//*[@id="content"]/div[2]//text()').extract()
        item['content'] = ''.join(content)
        yield item

    def parse(self, response):
        """Parse the home page: collect the section page URLs and request each."""
        li_list = response.xpath('//*[@id="index2016_wrap"]/div[3]/div[2]/div[2]/div[2]/div/ul/li')
        alist = [1, 2, 4, 5]  # positions of the wanted sections within the <li> list
        for index in alist:
            if index >= len(li_list):
                # Fewer <li> entries than expected: skip rather than raise
                # IndexError.
                continue
            model_url = li_list[index].xpath('./a/@href').extract_first()
            if model_url:
                self.models_url.append(model_url)
        # Request every collected section page in turn.
        for url in self.models_url:
            yield scrapy.Request(url=url, callback=self.detail_parse)

11
scrapy/xiaohua/scrapy.cfg Normal file
View File

@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html
[settings]
default = xiaohua.settings
[deploy]
#url = http://localhost:6800/
project = xiaohua

View File

View File

@ -0,0 +1,13 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class XiaohuaItem(scrapy.Item):
    """Scrapy item with two fields: ``author`` and ``content``."""

    author = scrapy.Field()
    content = scrapy.Field()

Some files were not shown because too many files have changed in this diff Show More