# 
import requests
import re
import os
import urllib.request
import random
class SseCrawl():
    """Crawler for the Shanghai Stock Exchange (SSE) disclosure feed.

    Fetches the bulletin-publish-order JSON, extracts each bulletin's
    title and PDF path with regexes, and downloads the PDFs into a
    local ``Pdf_Download`` directory.
    """

    def __init__(self):
        # JSON feed listing the most recently published bulletins.
        self.url = "http://www.sse.com.cn/disclosure/listedinfo/announcement/json/stock_bulletin_publish_order.json?v=0.46853839377888784"
        # Pool of User-Agent header dicts; one is chosen per request.
        self.headers = [{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36'}, ]
        # PDF paths in the feed are relative to this host.
        self.server = "http://www.sse.com.cn/"
        # BUGFIX: the original patterns used [sS], which matches only the
        # literal letters 's'/'S'; [\s\S] ("any character, incl. newline")
        # is the clear intent — without it the patterns never match.
        self.root_pattern = re.compile(r'{"([\s\S]*?)},')
        self.pdf_pattern = re.compile(r'"bulletinUrl":"([\s\S]*?)","securityCode"')
        self.name_pattern = re.compile(r'"bulletinTitle":"([\s\S]*?)","bulletinClassic"')

    def get_html(self):
        """Fetch the feed and return its body decoded as UTF-8 text."""
        # Send a browser-like User-Agent so the request matches the
        # headers the downloader uses.
        r = requests.get(self.url, headers=random.choice(self.headers))
        r.encoding = 'utf-8'
        return r.text

    def analysis(self, htmls):
        """Parse feed text into download descriptors.

        Returns a list of ``{'name': [titles...], 'address': url}`` dicts,
        one per ``{...},`` record found in *htmls*; ``address`` is the
        absolute PDF URL (server prefix + joined relative paths).
        """
        anchors = []
        for record in re.findall(self.root_pattern, htmls):
            pdf_paths = re.findall(self.pdf_pattern, record)
            titles = re.findall(self.name_pattern, record)
            anchors.append({
                'name': titles,
                'address': self.server + "".join(pdf_paths),
            })
        return anchors

    def download(self, anchors):
        """Download every anchor's PDF into ./Pdf_Download.

        Creates the directory if needed and chdirs into it (matching the
        original script's behavior of saving files there).
        """
        target = os.path.join(os.getcwd(), 'Pdf_Download')
        # BUGFIX: os.mkdir raised FileExistsError on every re-run.
        os.makedirs(target, exist_ok=True)
        os.chdir(target)
        for anchor in anchors:
            file_url = "".join(anchor['address'])
            # BUGFIX: the original passed the whole headers *list* to
            # Request (a TypeError); pass a single header dict instead.
            req = urllib.request.Request(file_url, headers=random.choice(self.headers))
            # with-blocks close both the HTTP response and the output file
            # even if a read/write fails mid-transfer.
            with urllib.request.urlopen(req) as resp, \
                    open("".join(anchor['name']), 'wb') as f:
                block_sz = 8192  # UFS default block size (8 KB)
                while True:
                    chunk = resp.read(block_sz)
                    if not chunk:
                        break
                    f.write(chunk)

    def main(self):
        """Fetch the feed, parse it, and download all PDFs."""
        htmls = self.get_html()
        anchors = self.analysis(htmls)
        self.download(anchors)
if __name__ == "__main__":
    # Run the crawl only when executed as a script; importing this
    # module should not trigger network requests or file writes.
    spider = SseCrawl()
    spider.main()
# --- Blog page footer captured along with the article (kept as comments;
# --- as bare text it was a syntax error) ---
# 上交所最新公告PDF下载代码-python
#   (SSE latest-announcement PDF download code - Python)
# 香橼树专利智慧平台通知书批量下载pdf
#   (Xiangyuanshu patent platform: batch-download notification PDFs)
# « 上一篇 (previous post): 电子投标好帮手——好用的PDF分割软件
#   (Handy e-bidding helper — a useful PDF splitting tool)
# 下一篇 » (next post)