I recently helped a friend write a simple crawler and, while tidying it up, turned it into a small GUI-based novel scraper for downloading novels from Biquge (笔趣阁).
Multi-threaded scraping: one thread per novel.
Proxy support: with multi-threaded scraping in particular, not using a proxy risks getting your IP banned.
Real-time output of scraping progress.
It uses threading.BoundedSemaphore() together with pool_sema.acquire() / pool_sema.release() to cap the number of worker threads so concurrency doesn't run away. The limit can be entered in the software's UI (the code below defaults to 5 threads).
Before any of the thread tasks start, create the semaphore once with pool_sema = threading.BoundedSemaphore(threadNum). At the start of each thread task, take a slot with pool_sema.acquire(), do the work, and release the slot with pool_sema.release() when the task finishes.
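A minimal sketch of that pattern (the worker function, the sleep, and the thread/slot counts here are illustrative, not taken from the tool itself):

```python
import threading
import time

pool_sema = threading.BoundedSemaphore(5)   # at most 5 workers run at once

def worker(task_id):
    pool_sema.acquire()               # block until a slot is free
    try:
        time.sleep(0.1)               # stand-in for the real download work
        print("task %d done" % task_id)
    finally:
        pool_sema.release()           # always return the slot, even on error

threads = [threading.Thread(target=worker, args=(i,), daemon=True)
           for i in range(20)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```

Wrapping the work in try/finally guarantees the slot comes back even if a download raises; the scraper below releases manually on each exit path instead.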
Install the dependencies first:

```
pip install requests
pip install pysimplegui
pip install lxml
pip install pyinstaller
```
The GUI is built with PySimpleGUI, a wrapper library around tkinter. It is very convenient to use; the interface isn't pretty, but it wins on simplicity, which makes it a good fit for small tools like this.
Docs: pysimplegui.readthedocs.io/en/latest/. For example, this window's layout is just a few nested lists:
```python
# each inner list is one row of the window
layout = [
    [sg.Text('输入要爬取的小说网址,点此打开笔趣阁站点复制', font=("微软雅黑", 12),
             key="openwebsite", enable_events=True, tooltip="点击在浏览器中打开")],
    [sg.Text("小说目录页url,一行一个:")],
    [
        sg.Multiline('', key="url", size=(120, 6), autoscroll=True,
                     expand_x=True, right_click_menu=['&Right', ['粘贴']])
    ],
    [sg.Text(visible=False, text_color="#ff0000", key="error")],
    [
        sg.Button(button_text='开始采集', key="start", size=(20, 1)),
        sg.Button(button_text='打开下载目录', key="opendir", size=(20, 1),
                  button_color="#999999")
    ],
    [sg.Text('填写ip代理,有密码格式 用户名:密码@ip:端口,无密码格式 ip:端口。如 demo:123456@127.0.0.1:8580')],
    [
        sg.Input('', key="proxy"),
        sg.Text('线程数量:'),
        sg.Input('5', key="threadnum"),
    ],
    [
        sg.Multiline('等待采集', key="res", disabled=True, border_width=0,
                     background_color="#ffffff", size=(120, 12), no_scrollbar=False,
                     autoscroll=True, expand_x=True, expand_y=True,
                     font=("宋体", 10), text_color="#999999")
    ],
]
```
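Once you have a layout, a PySimpleGUI program is just a window plus an event loop driven by window.read(); a stripped-down sketch of the pattern the full program below follows (title, text, and key here are illustrative):

```python
import PySimpleGUI as sg

layout = [[sg.Text("ready"), sg.Button("start", key="start")]]
window = sg.Window("demo", layout)

while True:
    event, values = window.read()   # blocks until a click or window close
    if event == sg.WIN_CLOSED:
        break
    if event == "start":
        print("start clicked, current inputs:", values)

window.close()
```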
To ship it as a single windowed executable, package it with PyInstaller (-F builds one self-contained file, -w suppresses the console window):

```
pyinstaller -Fw start.py
```

The complete source (start.py):
```python
import time
import requests
import os
import sys
import re
import random
from lxml import etree
import webbrowser
import PySimpleGUI as sg
import threading

# user-agent
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"
}
# proxy settings, filled in from the UI
proxies = {}
# base url of the Biquge site
baseurl = 'https://www.xbiquwx.la/'
# number of worker threads
threadNum = 5
pool_sema = None
THREAD_EVENT = '-THREAD-'
cjstatus = False
# directory for the downloaded txt files
filePath = os.path.abspath(os.path.join(os.getcwd(), 'txt'))
if not os.path.exists(filePath):
    os.mkdir(filePath)


# strip characters that may not appear in file names from the book title
def deletetag(text):
    return re.sub(r'[\[\]#/\\:*,;?"<>|()《》&^!~=%{}@!:。·!¥……() ]', '', text)


# entry point
def main():
    global cjstatus, proxies, threadNum, pool_sema
    sg.theme("reddit")
    layout = [
        [sg.Text('输入要爬取的小说网址,点此打开笔趣阁站点复制', font=("微软雅黑", 12),
                 key="openwebsite", enable_events=True, tooltip="点击在浏览器中打开")],
        [sg.Text("小说目录页url,一行一个:")],
        [
            sg.Multiline('', key="url", size=(120, 6), autoscroll=True,
                         expand_x=True, right_click_menu=['&Right', ['粘贴']])
        ],
        [sg.Text(visible=False, text_color="#ff0000", key="error")],
        [
            sg.Button(button_text='开始采集', key="start", size=(20, 1)),
            sg.Button(button_text='打开下载目录', key="opendir", size=(20, 1),
                      button_color="#999999")
        ],
        [sg.Text('填写ip代理,有密码格式 用户名:密码@ip:端口,无密码格式 ip:端口。如 demo:123456@127.0.0.1:8580')],
        [
            sg.Input('', key="proxy"),
            sg.Text('线程数量:'),
            sg.Input('5', key="threadnum"),
        ],
        [
            sg.Multiline('等待采集', key="res", disabled=True, border_width=0,
                         background_color="#ffffff", size=(120, 12), no_scrollbar=False,
                         autoscroll=True, expand_x=True, expand_y=True,
                         font=("宋体", 10), text_color="#999999")
        ],
    ]
    window = sg.Window('采集笔趣阁小说', layout, size=(800, 500), resizable=True)
    while True:
        event, values = window.read()
        if event == sg.WIN_CLOSED or event == 'close':  # user closed the window
            break
        if event == "openwebsite":
            webbrowser.open(baseurl)
        elif event == 'opendir':
            os.system('start explorer ' + filePath)
        elif event == 'start':
            if cjstatus:
                # already running: treat the click as "stop"
                cjstatus = False
                window['start'].update('已停止...点击重新开始')
                continue
            window['error'].update("", visible=False)
            # one catalog-page url per line; keep only well-formed ones
            urls = []
            for url in values['url'].strip().split("\n"):
                url = url.strip()
                if re.match(r'%s\d+_\d+/' % baseurl, url):
                    urls.append(url)
                elif len(url) > 0:
                    window['error'].update("地址错误:%s" % url, visible=True)
            if len(urls) < 1:
                window['error'].update("每行地址需符合 %s数字_数字/ 形式" % baseurl,
                                       visible=True)
                continue
            # proxy
            if len(values['proxy']) > 0:
                proxies = {
                    "http": "http://%s" % values['proxy'],
                    "https": "http://%s" % values['proxy']
                }
            # thread count
            if values['threadnum'] and int(values['threadnum']) > 0:
                threadNum = int(values['threadnum'])
            pool_sema = threading.BoundedSemaphore(threadNum)
            cjstatus = True
            window['start'].update('采集中...点击停止')
            window['res'].update('开始采集')
            # one thread per novel
            for url in urls:
                threading.Thread(target=downloadbybook,
                                 args=(url, window), daemon=True).start()
        elif event == "粘贴":
            window['url'].update(sg.clipboard_get())
        print("event", event)
        if event == THREAD_EVENT:
            # a worker thread reported progress: append it to the output box
            strtext = values[THREAD_EVENT][1]
            window['res'].update(window['res'].get() + "\n" + strtext)
    cjstatus = False
    window.close()


# download one novel
def downloadbybook(page_url, window):
    try:
        bookpage = requests.get(url=page_url, headers=header, proxies=proxies)
    except Exception as e:
        window.write_event_value(
            '-THREAD-', (threading.current_thread().name,
                         '请求 %s 错误,原因:%s' % (page_url, e)))
        return
    if not cjstatus:
        return
    # take a semaphore slot
    pool_sema.acquire()
    if bookpage.status_code != 200:
        window.write_event_value(
            '-THREAD-', (threading.current_thread().name,
                         '请求%s错误,原因:%s' % (page_url, bookpage.reason)))
        pool_sema.release()  # free the slot before bailing out
        return
    bookpage.encoding = 'utf-8'
    page_tree = etree.HTML(bookpage.text)
    bookname = page_tree.xpath('//div[@id="info"]/h1/text()')[0]
    bookfilename = filePath + '/' + deletetag(bookname) + '.txt'
    # chapter list on the catalog page
    zj_list = page_tree.xpath('//div[@class="box_con"]/div[@id="list"]/dl/dd')
    for dd in zj_list:
        if not cjstatus:
            break
        zjurl = page_url + dd.xpath('./a/@href')[0]
        zjname = dd.xpath('./a/@title')[0]
        try:
            zjpage = requests.get(zjurl, headers=header, proxies=proxies)
        except Exception as e:
            window.write_event_value(
                '-THREAD-', (threading.current_thread().name,
                             '请求%s:%s错误,原因:%s' % (zjname, zjurl, e)))
            continue
        if zjpage.status_code != 200:
            window.write_event_value(
                '-THREAD-', (threading.current_thread().name,
                             '请求%s:%s错误,原因:%s' % (zjname, zjurl, zjpage.reason)))
            pool_sema.release()  # free the slot before giving up on this book
            return
        zjpage.encoding = 'utf-8'
        zjpage_content = etree.HTML(zjpage.text).xpath('//div[@id="content"]/text()')
        content = "\n【" + zjname + "】\n"
        for line in zjpage_content:
            content += line.strip() + "\n"
        # append the chapter to the book's txt file
        with open(bookfilename, 'a+', encoding='utf-8') as fs:
            fs.write(content)
            window.write_event_value(
                '-THREAD-', (threading.current_thread().name,
                             '%s:%s 采集成功' % (bookname, zjname)))
        # small random delay between chapter requests
        time.sleep(random.uniform(0.05, 0.2))
    # done with this book
    window.write_event_value(
        '-THREAD-', (threading.current_thread().name,
                     '请求 %s 结束' % page_url))
    pool_sema.release()


if __name__ == '__main__':
    main()
```
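The thread-to-GUI handoff above is the part worth internalizing: tkinter widgets must only be touched from the main thread, so the workers never update the window directly. Instead they call window.write_event_value(), which queues a custom event that the main loop's window.read() then delivers like any button click. A stripped-down sketch of that pattern (the layout, keys, and worker payload here are illustrative):

```python
import threading
import time
import PySimpleGUI as sg

THREAD_EVENT = '-THREAD-'

def worker(window):
    for i in range(3):
        time.sleep(1)  # pretend to download a chapter
        # queue an event for the GUI thread; the tuple becomes values[THREAD_EVENT]
        window.write_event_value(
            THREAD_EVENT, (threading.current_thread().name, 'step %d done' % i))

layout = [[sg.Multiline('', key='res', size=(60, 10))],
          [sg.Button('go', key='go')]]
window = sg.Window('thread demo', layout)

while True:
    event, values = window.read()
    if event == sg.WIN_CLOSED:
        break
    if event == 'go':
        threading.Thread(target=worker, args=(window,), daemon=True).start()
    elif event == THREAD_EVENT:
        name, msg = values[THREAD_EVENT]
        window['res'].update(window['res'].get() + '\n%s: %s' % (name, msg))

window.close()
```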