#
import requests
import re
import os
import urllib.request
import random
class SseCrawl():
    """Crawler for the Shanghai Stock Exchange (SSE) disclosure feed.

    Fetches the bulletin-publish-order JSON, extracts each bulletin's
    title and PDF URL with regexes, and downloads the PDFs into a
    ``Pdf_Download`` sub-directory of the current working directory.
    """

    def __init__(self):
        # JSON feed listing the most recently published bulletins.
        self.url = "http://www.sse.com.cn/disclosure/listedinfo/announcement/json/stock_bulletin_publish_order.json?v=0.46853839377888784"
        # Pool of request headers; one is chosen at random per download.
        self.headers = [{'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.116 Safari/537.36'},]
        self.server = "http://www.sse.com.cn/"
        # BUGFIX: the original patterns used [sS] (the literal letters s and S)
        # instead of [\s\S] ("any character, including newlines") -- backslashes
        # were evidently lost when the code was pasted. With [sS] the patterns
        # could never match real JSON content.
        self.root_pattern = re.compile(r'{"([\s\S]*?)},')
        self.pdf_pattern = re.compile(r'"bulletinUrl":"([\s\S]*?)","securityCode"')
        self.name_pattern = re.compile(r'"bulletinTitle":"([\s\S]*?)","bulletinClassic"')

    def get_html(self):
        """Fetch the bulletin JSON feed and return its text decoded as UTF-8."""
        r = requests.get(self.url)
        r.encoding = 'utf-8'
        return r.text

    def analysis(self, htmls):
        """Parse feed text into anchors.

        htmls: raw text returned by get_html().
        Returns a list of dicts: {'name': [titles...], 'address': pdf_url}.
        """
        anchors = []
        for record in self.root_pattern.findall(htmls):
            pdf_parts = self.pdf_pattern.findall(record)
            url = self.server + "".join(pdf_parts)
            names = self.name_pattern.findall(record)
            anchors.append({'name': names, 'address': url})
        return anchors

    def download(self, anchors):
        """Download each anchor's PDF into ./Pdf_Download.

        Side effect: changes the process working directory into Pdf_Download.
        """
        # BUGFIX: os.mkdir raised FileExistsError on every run after the first.
        os.makedirs('Pdf_Download', exist_ok=True)
        os.chdir(os.path.join(os.getcwd(), 'Pdf_Download'))
        block_sz = 8192  # stream in 8 KiB chunks (UFS default block size)
        for anchor in anchors:
            file_url = "".join(anchor['address'])
            # BUGFIX: the original passed the whole headers *list* to Request
            # (urllib requires a dict), then re-built the request with the
            # undefined names `request`/`headers` and indexed element 0..9 of
            # a one-element list. Pick one header dict at random instead.
            req = urllib.request.Request(file_url, headers=random.choice(self.headers))
            # `with` guarantees the response and the file are closed even if
            # a read/write fails part-way through.
            with urllib.request.urlopen(req) as resp, \
                 open("".join(anchor['name']), 'wb') as f:
                while True:
                    chunk = resp.read(block_sz)
                    if not chunk:
                        break
                    f.write(chunk)

    def main(self):
        """Fetch the feed, parse it, and download every listed PDF."""
        htmls = self.get_html()
        anchors = self.analysis(htmls)
        self.download(anchors)
# Guard the entry point so importing this module does not trigger a crawl.
if __name__ == "__main__":
    spider = SseCrawl()
    spider.main()
上交所最新公告PDF下载代码-python
香橼树专利智慧平台通知书批量下载pdf
« 上一篇
电子投标好帮手——好用的PDF分割软件
下一篇 »