Python-bs4: Practice Scraping Baidu Tieba and a Novel Site

1. Scrape from Baidu Tieba: thread title & link, author, reply count, and post date.
# -*- coding:utf8 -*-
import requests
from bs4 import BeautifulSoup
"""
# Title & thread link:
<a rel="noreferrer" href="/p/5737326375" title="【姿原】大爆炸1-11。gao青。完整。中英字" target="_blank" class="j_th_tit ">【姿原】大爆炸1-11。gao青。完整。中英字</a>
# Author:
<span class="tb_icon_author " title="主题作者: 你的小掰呀" data-field="{"user_id":3478899584}"><i class="icon_author"></i><span class="frs-author-name-wrap"><a rel="noreferrer" data-field="{"un":"\u4f60\u7684\u5c0f\u63b0\u5440"}" class="frs-author-name j_user_card " href="/home/main/?un=%E4%BD%A0%E7%9A%84%E5%B0%8F%E6%8E%B0%E5%91%80&ie=utf-8&fr=frs" target="_blank">你的小掰呀</a></span><span class="icon_wrap  icon_wrap_theme1 frs_bright_icons "></span>    </span>
# Reply count:
<span class="threadlist_rep_num center_text" title="回复">214</span>
# Post date:
<span class="pull-right is_show_create_time" title="创建时间">6-8</span>
"""


def get_html(url):
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return "ERROR"

def get_content(url):
    '''
    Parse one Tieba list page and collect the info for every thread into a list.
    '''
    # List that will hold one dict per thread
    comments=[]
    html=get_html(url)
    soup=BeautifulSoup(html,'lxml')
    # Note: the class attribute of the thread <li> tags really starts with a space
    liTags=soup.find_all('li',attrs={'class':' j_thread_list clearfix'})
    for li in liTags:
        # Dict that stores the info for a single thread
        comment={}
        try:
            # Extract the fields and store them in the dict
            comment['title']=li.find('a',attrs={'class':'j_th_tit'}).text.strip()
            comment['link']="http://tieba.baidu.com"+li.find('a',attrs={'class':'j_th_tit'})['href']
            comment['name']=li.find('span',attrs={'class':'tb_icon_author '}).text.strip()
            comment['time']=li.find('span',attrs={'class':'pull-right is_show_create_time'}).text
            comment['replyNum']=li.find('span',attrs={'class':'threadlist_rep_num center_text'}).text.strip()
            comments.append(comment)
            print(comment)
        except (AttributeError, TypeError, KeyError):
            print('Failed to parse one thread entry')
    return comments

def OUT2FILE(comments):
    with open('E:/TTBT.txt','a+',encoding='utf-8',errors='ignore') as f:
        for comment in comments:
            f.write('Title: {} \t Link: {} \t Author: {} \t Posted: {} \t Replies: {} \n'.format(comment['title'],comment['link'], comment['name'], comment['time'], comment['replyNum']))

        print('Current page scraped and saved')

def main(base_url,deep):
    url_list=[]
    # Build the list of URLs to crawl; each Tieba list page is offset by 50 threads (&pn=0, 50, 100, ...)
    for i in range(0,deep):
        url_list.append(base_url+'&pn='+str(50*i))
    print('URL list built, start fetching and parsing the pages...')
    # Fetch, parse and save every page in turn
    for url in url_list:
        content=get_content(url)
        OUT2FILE(content)
    print('All pages have been saved!')

base_url = 'http://tieba.baidu.com/f?kw=%E7%94%9F%E6%B4%BB%E5%A4%A7%E7%88%86%E7%82%B8&ie=utf-8'
# Number of Tieba list pages to crawl
deep=3

if __name__ == '__main__':
    main(base_url, deep)
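
Before pointing the script at the live site, the selectors can be sanity-checked offline against the markup quoted in the docstring at the top of the script. The snippet below is only a minimal sketch: the `<li>` fragment is stitched together from those quoted samples (trimmed for brevity), so no network access is needed; the outer match uses a single class name for simplicity, whereas the full script matches the exact class string.

from bs4 import BeautifulSoup

# One thread entry assembled from the sample markup quoted above (trimmed)
sample = '''
<ul>
  <li class=" j_thread_list clearfix">
    <span class="threadlist_rep_num center_text" title="回复">214</span>
    <a rel="noreferrer" href="/p/5737326375" class="j_th_tit ">【姿原】大爆炸1-11</a>
    <span class="tb_icon_author " title="主题作者: 你的小掰呀">你的小掰呀</span>
    <span class="pull-right is_show_create_time" title="创建时间">6-8</span>
  </li>
</ul>
'''

soup = BeautifulSoup(sample, 'lxml')
li = soup.find('li', class_='j_thread_list')
print(li.find('a', attrs={'class': 'j_th_tit'}).text.strip())                         # title
print('http://tieba.baidu.com' + li.find('a', attrs={'class': 'j_th_tit'})['href'])   # link
print(li.find('span', attrs={'class': 'threadlist_rep_num center_text'}).text)        # reply count
print(li.find('span', attrs={'class': 'pull-right is_show_create_time'}).text)        # post date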

2. Scrape the 笔趣阁 (qu.la) ranking-list novels and their links, get all chapter links for a single novel, then fetch every chapter's text and write it to a local file.

# -*- coding:utf8 -*-
import requests
from bs4 import BeautifulSoup

def get_html(url):
    try:
        r=requests.get(url,timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        return "ERROR"

## Get the ranking-list novels and their links:
def get_content(url):
    url_list=[]
    html=get_html(url)
    soup=BeautifulSoup(html,'lxml')
    # Per-category ranking blocks and the "completed novels" ranking block
    category_list=soup.find_all('div',attrs={'class':'index_toplist mright mbottom'})
    history_finished_list=soup.find_all('div',attrs={'class':'index_toplist mbottom'})
    for cate in category_list:
        i=0
        name=cate.find('div',class_='toptab').span.string
        with open('E:/novel_list.csv','a+') as f:
            f.write("\nCategory: {} \n".format(name))
            print("Category: %s"%name)

        # Only the tab that is currently visible in each ranking block
        general_list=cate.find(style="display: block;")
        book_list=general_list.find_all('li')
        for book in book_list:
            i=i+1
            link='http://www.qu.la'+book.a['href']
            title=book.a['title']
            url_list.append(link)
            with open('E:/novel_list.csv','a') as f:
                f.write("Novel: {:<} \t URL: {:<} \n".format(title,link))
                print("Rank %d novel: %s\tURL: %s"%(i,title,link))
    for cate in history_finished_list:
        i=0
        name = cate.find('div',attrs={'class':'toptab'}).span.string
        with open('E:/novel_list.csv', 'a') as f:
            f.write("\nCategory: {} \n".format(name))
            print("Category: %s" % name)

        general_list = cate.find(style='display: block;')
        book_list = general_list.find_all('li')
        for book in book_list:
            i=i+1
            link = 'http://www.qu.la' + book.a['href']
            title = book.a['title']
            url_list.append(link)
            with open('E:/novel_list.csv', 'a') as f:
                f.write("Novel: {:<} \t URL: {:<} \n".format(title, link))
                print("Rank %d novel: %s\tURL: %s"%(i,title,link))

    return url_list

## Get all chapter links for a single novel:
def get_txt_url(url):
    url_list = []
    html=get_html(url)
    soup=BeautifulSoup(html,'lxml')
    # Every chapter entry of the table of contents sits in a <dd> tag
    lista=soup.find_all('dd')
    txt_name=soup.find('h1').text
    with open('E:/xiaoshuo/{}.txt'.format(txt_name),'a') as f:
        f.write('Novel title: {} \n'.format(txt_name))
        print("Novel title: %s\n"%txt_name)
    for dd in lista:
        title_name=dd.a.string
        url_list.append('http://www.qu.la'+dd.a['href'])
        with open('E:/xiaoshuo/{}.txt'.format(txt_name), 'a') as f:
            f.write('%s link: http://www.qu.la%s \n'%(title_name,dd.a['href']))
            print('%s link: http://www.qu.la%s'%(title_name,dd.a['href']))

    return url_list,txt_name

### Fetch the text of one chapter and write it to a local file
def get_one_txt(url):
    # Turn <br> tags into newlines before parsing so paragraph breaks survive
    html=get_html(url).replace('<br>','\n')
    soup=BeautifulSoup(html,'lxml')

    # Drop the site's injected 'chaptererror();' call from the chapter body
    txt=soup.find('div',id='content').text.replace('chaptererror();','')
    title=soup.find('title').text
    with open('E:/xiaoshuo/{}.txt'.format(title),'a',encoding='utf-8') as f:
        f.write('\t\t\t\t\t\t'+title+'\n\n')
        f.write(txt)
        print(txt)
        print('Chapter {} downloaded'.format(title))


## 笔趣阁 ranking page: https://www.qu.la/paihangbang/
url1='https://www.qu.la/paihangbang/'

if __name__ == '__main__':
    # Ask for the URLs only when the script is run directly, not on import
    url2=input("Enter the URL of the novel whose chapter list should be scraped: ")
    url3=input("Enter the URL of the chapter whose text should be scraped: ")
    get_content(url1)
    get_txt_url(url2)
    get_one_txt(url3)
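
The three functions can also be chained to download an entire novel in one go. The sketch below is an addition of mine, not part of the original script: download_whole_novel is a hypothetical helper that reuses the functions above (get_txt_url for the chapter list, get_one_txt for each chapter), with a short time.sleep pause added out of courtesy to the server.

import time

def download_whole_novel(novel_url):
    # Step 2: collect every chapter URL plus the novel's title
    chapter_urls, txt_name = get_txt_url(novel_url)
    # Step 3: fetch each chapter; get_one_txt writes every chapter to its own file under E:/xiaoshuo/
    for chapter_url in chapter_urls:
        get_one_txt(chapter_url)
        time.sleep(1)  # pause between requests to avoid hammering the site
    print('Novel "{}" finished: {} chapters downloaded'.format(txt_name, len(chapter_urls)))

# Example usage (hypothetical novel URL):
# download_whole_novel('https://www.qu.la/book/12345/')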


Reference

从零开始写Python爬虫 (Writing a Python Crawler from Scratch)