python怎么爬取b站关注列表及数据库的设计与操作-mile米乐体育
开发技术
2022年05月19日 10:35
2
python怎么爬取b站关注列表及数据库的设计与操作
这篇文章主要介绍了python怎么爬取b站关注列表及数据库的设计与操作的相关知识,内容详细易懂,操作简单快捷,具有一定借鉴价值,相信大家阅读完这篇python怎么爬取b站关注列表及数据库的设计与操作文章都会有所收获,下面我们一起来看看吧。
一、数据库的设计与操作
1、数据的分析
b站的关注列表在
https://api.bilibili.com/x/relation/followings?vmid=uid&pn=1&ps=50&order=desc&order_type=attention
中,一页最多50条信息。
我们大致分析一下信息,
{"code":0,"message":"0","ttl":1,"data":{"list":[{……
首先,列表内容存在data:list里。
其次,对于列表中每一项,有如下信息
"mid":672353429,"attribute":2,"mtime":1630510107,"tag":null,"special":0,"contract_info":{"is_contractor":false,"ts":0,"is_contract":false,"user_attr":0},"uname":"贝拉kira","face":"http://i2.hdslb.com/bfs/face/668af440f8a8065743d3fa79cfa8f017905d0065.jpg","sign":"元气满满的a-soul舞担参上~目标topidol,一起加油!","official_verify":{"type":0,"desc":"虚拟偶像团体a-soul所属艺人"},"vip":{"viptype":2,"vipduedate":1674576000000,"dueremark":"","accessstatus":0,"vipstatus":1,"vipstatuswarn":"","themetype":0,"label":{"path":"","text":"年度大会员","label_theme":"annual_vip","text_color":"#ffffff","bg_style":1,"bg_color":"#fb7299","border_color":""},"avatar_subscript":1,"nickname_color":"#fb7299","avatar_subscript_url":"http://i0.hdslb.com/bfs/vip/icon_certification_big_member_22_3x.png"}
其中,mid为用户独一无二的uid;viptype表示会员类型:0是什么都没开,1是大会员,2是年度大会员;official_verify中的type表示认证情况:0代表有官方认证,-1代表没有官方认证。
同时我们发现,如果对方锁了列表,会返回
{"code":-400,"message":"请求错误","ttl":1}
2、数据库设计
基于这些,我们先设计数据库,包含两张表,用户信息的基本属性表和关注的关系表。
def createdb():
    """Create the crawler's SQLite schema if it does not exist yet.

    Opens (creating it when missing) ``bilifollowdb.db`` in the current
    working directory and makes sure two tables are present:

    * ``user`` -- one row per account: uid, name, sign, viptype,
      verifytype and verifydesc.
    * ``relation`` -- one row per "follower follows following" edge plus
      the time the follow happened.

    The function is safe to call repeatedly because both statements use
    ``create table if not exists``.
    """
    usertableddl = '''create table if not exists user(
        uid int primary key not null,
        name varchar not null,
        sign varchar default null,
        viptype int not null,
        verifytype int not null,
        verifydesc varchar default null
    )'''
    # Each end of an edge references user.uid on its own.  A single
    # composite key (follower, following) -> user(uid, uid) can never match
    # a parent key, so it fails as soon as foreign keys are enforced.
    relationtableddl = '''create table if not exists relation(
        follower int not null,
        following int not null,
        followtime int not null,
        primary key (follower, following),
        foreign key (follower) references user(uid),
        foreign key (following) references user(uid)
    )'''

    link = sqlite3.connect('bilifollowdb.db')
    try:
        print("database open success")
        # create user table
        link.execute(usertableddl)
        # create relation table
        link.execute(relationtableddl)
        print("database create success")
        link.commit()
    finally:
        # Release the file handle even when one of the statements fails.
        link.close()
3、数据库操作
其次是插入新用户的列表,我的思路是爬完一个人的关注列表,把一整个list丢给该函数,判断是否存在新增用户,存在则把新增用户传回,作为下一次爬虫的起点。
def insertuser(infos):
    """Store the users that are not in the ``user`` table yet.

    Args:
        infos: iterable of dicts with the keys ``uid``, ``name``,
            ``viptype``, ``verifytype``, ``sign`` and ``verifydesc``.

    Returns:
        The uids that were newly inserted, in the order they were seen.
        A uid that is already stored (or repeated inside ``infos``) is
        skipped and not reported.
    """
    insertcmd = ("insert into user(uid,name,viptype,verifytype,sign,verifydesc) "
                 "values(?,?,?,?,?,?);")
    # Bound parameter instead of %-formatting the uid into the statement.
    existcmd = "select count(uid) from user where uid=?;"

    newid = []
    conn = sqlite3.connect('bilifollowdb.db')
    try:
        link = conn.cursor()
        for info in infos:
            already_stored = link.execute(existcmd, (info['uid'],)).fetchone()[0]
            if already_stored:
                continue
            link.execute(insertcmd, (info['uid'], info['name'], info['viptype'],
                                     info['verifytype'], info['sign'],
                                     info['verifydesc']))
            newid.append(info['uid'])
        conn.commit()
    finally:
        # Close the connection even when an insert raises.
        conn.close()
    return newid
然后是插入关系的函数,这个比较简单
def insertfollowing(uid: int, subscribe):
    """Store the "uid follows X" edges in the ``relation`` table.

    Args:
        uid: uid of the follower whose list was crawled.
        subscribe: iterable of ``(following_uid, follow_time)`` pairs.

    A row that cannot be stored -- typically because the edge is already
    present and violates the primary key -- is printed and skipped, so
    crawling the same user twice does not abort the whole batch.
    """
    insertcmd = "insert into relation(follower,following,followtime) values(?,?,?);"

    conn = sqlite3.connect('bilifollowdb.db')
    try:
        link = conn.cursor()
        for follow in subscribe:
            row = (uid, follow[0], follow[1])
            try:
                link.execute(insertcmd, row)
            except sqlite3.Error:
                # Report the rejected edge and keep going with the rest.
                print(row)
        conn.commit()
    finally:
        conn.close()
二、爬虫
通过观察,我们发现睿叔叔锁了5页的关注列表
即使是人工操作也只能访问5页,那没办法啦,我们就爬5页吧。
def getfollowinglist(uid: int):
    """Crawl the following list of *uid* and store it in the database.

    Requests pages 1 to 5 of the followings API (50 entries per page).
    Paging stops at the first page that cannot be fetched, reports an
    error code, or is empty; everything collected up to that point is
    still written through ``insertuser`` and ``insertfollowing``.

    Args:
        uid: uid of the account whose following list is crawled.

    Returns:
        The uids that ``insertuser`` reported as newly stored, meant to be
        used as the starting points of the next crawl round.
    """
    url = "https://api.bilibili.com/x/relation/followings?vmid=%d&pn=%d&ps=50&order=desc&order_type=attention&jsonp=jsonp"  # % (uid, page number)
    infos = []
    subscribe = []

    for page in range(1, 6):
        try:
            # Without a timeout a stalled connection blocks the crawl forever.
            html = requests.get(url % (uid, page), timeout=10)
        except requests.RequestException:
            print("get error!")
            break
        if html.status_code != 200:
            print("get error!")
            break  # the body of a failed response is not worth parsing

        try:
            dic = json.loads(html.text)
        except ValueError:
            break
        # -400 is returned for a locked list; any other non-zero code has
        # no usable 'data' either, so treat them all the same way.
        if dic.get('code') != 0:
            break
        users = (dic.get('data') or {}).get('list') or []
        if not users:
            break  # ran past the last page

        for usr in users:
            vip = usr['vip']
            verify = usr['official_verify']
            info = {
                'uid': usr['mid'],
                'name': usr['uname'],
                # NOTE(review): the live API is believed to spell this key
                # 'vipType'; the lower-case spelling from the article's
                # sample is kept as a fallback -- confirm on a real response.
                'viptype': vip['vipType'] if 'vipType' in vip else vip['viptype'],
                'verifytype': verify['type'],
                'sign': usr['sign'],
            }
            # Unverified accounts get the literal string 'null' as description.
            if info['verifytype'] == -1:
                info['verifydesc'] = 'null'
            else:
                info['verifydesc'] = verify['desc']
            subscribe.append((usr['mid'], usr['mtime']))
            infos.append(info)

    newid = insertuser(infos)
    insertfollowing(uid, subscribe)
    return newid
三、完整代码
# by concyclics
# -*- coding: utf-8 -*-
import sqlite3
import json

import requests

# SQLite file (relative to the working directory) shared by every function.
_DB_PATH = 'bilifollowdb.db'
# Followings endpoint; filled with % (uid, page number).
_FOLLOWINGS_URL = ("https://api.bilibili.com/x/relation/followings"
                   "?vmid=%d&pn=%d&ps=50&order=desc&order_type=attention&jsonp=jsonp")
# Pages 1..5 are requested; the original script hard-coded range(1, 6).
_MAX_PAGES = 5


def createdb():
    """Create the ``user`` and ``relation`` tables if they do not exist yet."""
    usertableddl = '''create table if not exists user(
        uid int primary key not null,
        name varchar not null,
        sign varchar default null,
        viptype int not null,
        verifytype int not null,
        verifydesc varchar default null
    )'''
    # Each end of an edge references user.uid on its own.  A single
    # composite key (follower, following) -> user(uid, uid) can never match
    # a parent key, so it fails as soon as foreign keys are enforced.
    relationtableddl = '''create table if not exists relation(
        follower int not null,
        following int not null,
        followtime int not null,
        primary key (follower, following),
        foreign key (follower) references user(uid),
        foreign key (following) references user(uid)
    )'''

    link = sqlite3.connect(_DB_PATH)
    try:
        print("database open success")
        # create user table
        link.execute(usertableddl)
        # create relation table
        link.execute(relationtableddl)
        print("database create success")
        link.commit()
    finally:
        link.close()


def insertuser(infos):
    """Store the users that are not in the ``user`` table yet.

    Args:
        infos: iterable of dicts with the keys ``uid``, ``name``,
            ``viptype``, ``verifytype``, ``sign`` and ``verifydesc``.

    Returns:
        The uids that were newly inserted, in the order they were seen.
    """
    insertcmd = ("insert into user(uid,name,viptype,verifytype,sign,verifydesc) "
                 "values(?,?,?,?,?,?);")
    # Bound parameter instead of %-formatting the uid into the statement.
    existcmd = "select count(uid) from user where uid=?;"

    newid = []
    conn = sqlite3.connect(_DB_PATH)
    try:
        link = conn.cursor()
        for info in infos:
            already_stored = link.execute(existcmd, (info['uid'],)).fetchone()[0]
            if already_stored:
                continue
            link.execute(insertcmd, (info['uid'], info['name'], info['viptype'],
                                     info['verifytype'], info['sign'],
                                     info['verifydesc']))
            newid.append(info['uid'])
        conn.commit()
    finally:
        conn.close()
    return newid


def insertfollowing(uid: int, subscribe):
    """Store the "uid follows X" edges in the ``relation`` table.

    Args:
        uid: uid of the follower whose list was crawled.
        subscribe: iterable of ``(following_uid, follow_time)`` pairs.

    A row the database rejects (e.g. an edge that is already stored) is
    printed and skipped so the rest of the batch is still written.
    """
    insertcmd = "insert into relation(follower,following,followtime) values(?,?,?);"

    conn = sqlite3.connect(_DB_PATH)
    try:
        link = conn.cursor()
        for follow in subscribe:
            row = (uid, follow[0], follow[1])
            try:
                link.execute(insertcmd, row)
            except sqlite3.Error:
                print(row)
        conn.commit()
    finally:
        conn.close()


def _fetch_following_pages(uid):
    """Yield the user list of each followings page of *uid*, stopping at the
    first page that fails, reports an error code, or is empty."""
    for page in range(1, _MAX_PAGES + 1):
        try:
            # Without a timeout a stalled connection blocks the crawl forever.
            html = requests.get(_FOLLOWINGS_URL % (uid, page), timeout=10)
        except requests.RequestException:
            print("get error!")
            return
        if html.status_code != 200:
            print("get error!")
            return
        try:
            dic = json.loads(html.text)
        except ValueError:
            return
        # -400 is returned for a locked list; any other non-zero code has
        # no usable 'data' either.
        if dic.get('code') != 0:
            return
        users = (dic.get('data') or {}).get('list') or []
        if not users:
            return
        yield users


def _userinfo(usr):
    """Pick the columns of the ``user`` table out of one API list entry."""
    vip = usr['vip']
    verify = usr['official_verify']
    return {
        'uid': usr['mid'],
        'name': usr['uname'],
        # NOTE(review): the live API is believed to spell this key
        # 'vipType'; the lower-case spelling is kept as a fallback --
        # confirm on a real response.
        'viptype': vip['vipType'] if 'vipType' in vip else vip['viptype'],
        'verifytype': verify['type'],
        'sign': usr['sign'],
        # Unverified accounts get the literal string 'null' as description.
        'verifydesc': 'null' if verify['type'] == -1 else verify['desc'],
    }


def getfollowinglist(uid: int):
    """Crawl the following list of *uid* and store users and edges.

    Pages fetched before a failure are still persisted instead of being
    thrown away.

    Returns:
        The uids that were not in the ``user`` table before this call.
    """
    infos = []
    subscribe = []
    for users in _fetch_following_pages(uid):
        for usr in users:
            infos.append(_userinfo(usr))
            subscribe.append((usr['mid'], usr['mtime']))

    newid = insertuser(infos)
    insertfollowing(uid, subscribe)
    return newid


def getfollowinguid(uid: int):
    """Return the uids *uid* follows, across all reachable pages.

    Nothing is written to the database.  The result list is created once,
    before paging starts, so later pages no longer overwrite earlier ones.
    """
    ids = []
    for users in _fetch_following_pages(uid):
        ids.extend(usr['mid'] for usr in users)
    return ids


def work(root):
    """Breadth-first crawl starting from the uids in *root*.

    Every round crawls the current uids and continues with the users that
    were new to the database, until a round discovers nobody new.
    """
    idlist = root
    while idlist:
        tmplist = []
        for uid in idlist:
            print(uid)
            # Accumulate the new uids of every crawled user; plain
            # assignment would keep only the last user's result.
            tmplist += getfollowinglist(uid)
        idlist = tmplist


def rework():
    """Return uids followed by stored users that are missing from ``user``.

    Reads every uid from the ``user`` table, re-fetches each one's
    following list and collects, without duplicates, the followed uids
    that are not stored yet.
    """
    conn = sqlite3.connect(_DB_PATH)
    try:
        ids = [row[0] for row in conn.execute("select uid from user;")]
    finally:
        conn.close()
    print(ids)

    # Separate names for the stored uids and the fetched ones: reusing one
    # variable for both made the membership test below always fail.
    known = set(ids)
    newid = []
    for uid in ids:
        for following in getfollowinguid(uid):
            if following not in known:
                known.add(following)
                newid.append(following)
    return newid


if __name__ == "__main__":
    createdb()
    # work([**put root uid here**, ])
关于“python怎么爬取b站关注列表及数据库的设计与操作”这篇文章的内容就介绍到这里,感谢各位的阅读!相信大家对“python怎么爬取b站关注列表及数据库的设计与操作”知识都有一定的了解,大家如果还想学习更多知识,欢迎关注恰卡编程网行业资讯频道。
展开全文