用Python爬取金融市场数据

发布时间：2019-01-17 23:31:45 所属栏目：教程来源：隠塵退士
导读：一、写在前面 因为在平常的工作中，需要对某信托网的信托在售和资管在售数据进行统计分析，但是一条一条地输入，显然过于耗时耗力，于是萌生了写个爬虫的想法。一门计算机语言，可以当做是在模仿人的目的或意图来进行一系列行为或动作，所以在写代码之前……
⑥ 保存数据到本地(以dataframe格式保存到本地CSV文件)
# 保存数据为dataframe格式CSV文件 
    df = pd.DataFrame(items) 
    df.to_csv('data.csv',index=False,sep=',',encoding='utf-8-sig') 
 
好了，现在就大功告成了，最后不要只让自己爽，也要让对方的服务器别太难受，在一些地方休眠几秒，完整代码如下。 
 
import urllib.request 
import urllib.parse 
import re 
import random 
from bs4 import BeautifulSoup 
import pandas as pd 
import time 
 
# Helper 1: joint -- build the request URL for the listing API or a detail page
def joint(url, size=None, page=None, type=None, id=None):
    """Return ``url`` with a URL-encoded query string appended.

    Two request shapes are produced:

    * detail page -- when ``id`` is given, the query is just ``id=<id>``;
    * listing API -- otherwise, the paging and filter parameters are encoded.

    The branch is selected from the arguments instead of from ``len(url)``.
    The former length test (``len(url) > 45``) sent any listing URL of 45
    characters or fewer down the detail branch and produced ``...?id=None``.

    Args:
        url: Base URL; the query string is appended verbatim, so it is
            expected to already end with ``?``.
        size: Value sent as ``pageSize`` (listing requests only).
        page: Value sent as ``pageIndex`` after ``str()`` conversion
            (listing requests only).
        type: Product type placed in the ``producttype:`` filter
            (listing requests only). Converted with ``str()``.
        id: Product id for a detail-page request.

    Returns:
        The full request URL as a string.

    Note:
        ``type`` and ``id`` shadow builtins, but they are keyword names that
        callers already use, so they are kept for compatibility.
    """
    if id is not None:
        # Detail page: only the product id is needed.
        params = {'id': id}
    else:
        # Listing API: restrict to products that are currently on sale.
        condition = 'producttype:' + str(type) + '|status:在售'
        params = {
            'mode': 'statistics',
            'pageSize': size,
            'pageIndex': str(page),
            'conditionStr': condition,
            'start_released': '',
            'end_released': '',
            'orderStr': '1',
            'ascStr': 'ulup',
        }
    return url + urllib.parse.urlencode(params)
 
# Helper 2: que_res -- build the request, send it through a proxy, return the body
def que_res(url, timeout=30):
    """Fetch ``url`` through a randomly chosen proxy and return the decoded body.

    A User-Agent and a proxy are each picked at random per call so that
    consecutive requests do not look identical to the server.

    Args:
        url: Fully built request URL.
        timeout: Seconds to wait on the connection before giving up.
            Previously no timeout was set, so an unresponsive proxy could
            block the crawl indefinitely.

    Returns:
        The response body decoded with ``bytes.decode()``'s default codec
        (UTF-8).

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Network errors raised by ``urllib`` (e.g. ``urllib.error.URLError``)
        propagate unchanged.
    """
    # Step 1 of building the request: the headers.
    USER_AGENTS = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        ]
    user_agent = random.choice(USER_AGENTS)
    headers = {
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Connection': 'keep-alive',
        'Host': 'www.某信任网.com',
        'Referer': 'http://www.某信任网.com/Product/Index.aspx',
        'User-Agent': user_agent,
        'X-Requested-With': 'XMLHttpRequest'
        }

    # Step 2 of building the request: the Request object itself.
    request = urllib.request.Request(url=url, headers=headers)

    # Step 1 of sending: pick a proxy from the pool.
    proxy_list = [
        {'http':'125.40.29.100:8118'},
        {'http':'14.118.135.10:808'}
        ]
    proxy = random.choice(proxy_list)

    # Step 2 of sending: create the handler and opener for that proxy.
    handler = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(handler)

    # Step 3 of sending: issue the request, read and decode the body.
    # The ``with`` block closes the connection even if read/decode fails.
    with opener.open(request, timeout=timeout) as raw_response:
        response = raw_response.read().decode()

    return response
 
# Helper 3: parse_content_1 -- extract the listing rows from the first-level response
def parse_content_1(response):
    """Pull one tuple per product row out of the listing response text.

    Each row object starting at ``{"ROWID"`` is matched with a regular
    expression and seven fields are captured, in this order: ``ID``,
    ``Title``, ``issuers``, ``released`` (the part before `` 0:00:00``),
    ``PeriodTo``, ``moneyinto`` and ``EstimatedRatio1``.

    Args:
        response: The listing response body as text.

    Returns:
        A list of 7-tuples of strings; empty when nothing matches.
    """
    row_pattern = r'{"ROWID".*?"ID":"(.*?)","Title":"(.*?)","producttype".*?"issuers":"(.*?)","released":"(.*?) 0:00:00","PeriodTo":(.*?),"StartPrice".*?"moneyinto":"(.*?)","EstimatedRatio1":(.*?),"status":.*?"}'
    return re.findall(row_pattern, response)
 
# Helper 4: parse_content_2 -- parse a detail page and build the output record
def parse_content_2(response, content):
    """Parse a product detail page and merge it with its listing tuple.

    All detail fields are read from the rows selected by
    ``#procon1 > table > tr``; the row indexes used below (3, 4, 7, 11) are
    fixed positions in that table.

    Fixes relative to the previous version:
      * the scale regex was ``(d+)`` (missing backslash) and matched the
        letter "d" instead of digits, so extracting the number failed;
      * a yield given as a single value (no "至" separator) no longer raises;
        it is used as both the minimum and the maximum;
      * the row selection is done once instead of four times.

    Args:
        response: Detail page HTML as text.
        content: One tuple produced by ``parse_content_1``; indexes 1-6 are
            copied into the record.

    Returns:
        A dict with the listing fields plus the fields scraped here.

    Raises:
        IndexError: If the page does not contain the expected rows/cells.
    """
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # consider pinning one for reproducible results.
    soup = BeautifulSoup(response)
    rows = soup.select('#procon1 > table > tr')

    # 4th row: place of issue and income distribution method.
    place_cells = rows[3].select('.pro-textcolor')
    address = place_cells[0].text
    r_style = place_cells[1].text

    # 5th row: issue scale; keep only the first run of digits from the text.
    guimo = rows[4].select('.pro-textcolor')[1].text
    scale_match = re.search(r'\d+', guimo)
    scale = scale_match.group() if scale_match else ''

    # 8th row: yield text; drop the final character (the trailing "%") and
    # split a "min至max" range into its two ends.
    rate = rows[7].select('.pro-textcolor')[0].text[:-1]
    bounds = rate.split('至')
    r_min = bounds[0]
    r_max = bounds[1] if len(bounds) > 1 else bounds[0]

    # 12th row: rating, taken from its first <p>.
    r_grade = rows[11].select('p')[0].text

    # Assemble the record; keys become the CSV column headers.
    item = {
    '产物名称':content[1],
    '刊行机构':content[2],
    '刊行时刻':content[3],
    '产物限期':content[4],
    '投资行业':content[5],
    '首页收益':content[6],
    '刊行地': address,
    '收益分派方法': r_style,
    '刊行局限': scale,
    '最低收益': r_min,
    '最高收益': r_max,
    '利率品级': r_grade
    }

    return item
 
# Entry point: prompt for crawl settings, crawl both levels, save to CSV
def main():
    """Run the interactive crawl and write the results to ``data.csv``.

    Prompts for page size, first/last page and product type, then for each
    listing page fetches the rows, fetches every row's detail page, and
    collects one record per product. Sleeps 2 seconds after each listing
    page and 5 seconds after each detail page. The CSV is written once,
    after all pages have been processed.
    """
    # Base URLs for the listing API and the detail page.
    list_api = 'http://www.某信任网.com/Action/ProductAJAX.ashx?'
    detail_page = 'http://www.某信任网.com/Product/Detail.aspx?'

    # Crawl settings entered by the user.
    page_size = input('请输入每页表现数目:')
    first_page = int(input('请输入起始页码:'))
    last_page = int(input('请输入竣事页码'))
    product_type = input('请输入产物范例(1代表信任，2代表资管):')

    records = []  # one dict per crawled product

    for page_no in range(first_page, last_page + 1):
        # --- first level: the listing page ---
        print('第{}页开始爬取'.format(page_no))
        listing_url = joint(list_api, size=page_size, page=page_no, type=product_type)
        rows = parse_content_1(que_res(listing_url))

        # Pause before hitting the detail pages.
        time.sleep(2)

        # --- second level: one detail page per listing row ---
        for row in rows:
            product_id = row[0]
            print('    第{}页{}开始下载'.format(page_no, product_id))

            detail_html = que_res(joint(detail_page, id=product_id))
            records.append(parse_content_2(detail_html, row))

            print('    第{}页{}竣事下载'.format(page_no, product_id))
            # Pause between detail requests.
            time.sleep(5)

        print('第{}页竣事爬取'.format(page_no))

    # Persist everything as a CSV file via a DataFrame.
    pd.DataFrame(records).to_csv('data.csv', index=False, sep=',', encoding='utf-8-sig')

    print('*' * 30)
    print('所有爬取竣事')
 
# Start the crawl only when this file is executed directly as a script.
if __name__ == '__main__': 
    main()
3、爬取结果
（编辑：湖南网）
【声明】本站内容均来自网络，其相关言论仅代表作者个人观点，不代表本站立场。若无意侵犯到您的权利，请及时与联系站长删除相关内容!
11/17
首页
尾页
教你如何安装ghost xp	深度技术Ghost xp系统
ghost xp sp3电脑公司	8187无线网卡驱动,教您