简单例子展示爬虫在不同思想下的写法
注:以用户选择爬取百度贴吧不同主题的所选页面的源码为目的展现各种写法,各有各的好处。重点在于自己思考。

爬取前,首先找规律。
注意:因为发现其 url 有这样的规律,所以可采用这种方法。
1.把不同页面url地址腾到一个地方,做一些判断和修改,再用修改后的url去搜索,看是否为正确的url,由下面的url:
(修改:对url不同地方做一些删减)
(测试:用浏览器打开修改后的url,看是否得到目标页面)

# https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=0 第一页
# https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=50 第二页
# https://tieba.baidu.com/f?kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B&ie=utf-8&pn=100 第三页

得出以下格式:https://tieba.baidu.com/f?kw=(主题)[&ie=utf-8&]pn=(页数*50-50)
注:1.中括号内内容可有可无。(由修改和测试可得)
2.括号内是我们需要操作的,可传参的地方。

1.普通写法
from urllib import parse
from urllib import request

# Plain-script version: prompt for a tieba topic and a page range, then
# download each page's HTML and save it to a local file.
name = input('选择您要查看的主题:')
start = int(input('选择起始页:'))
end = int(input('选择结束页:'))

# Desktop-browser User-Agent so the server returns the normal page markup
# instead of blocking the default urllib agent.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko)"
                  " Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
}

base_url = 'https://tieba.baidu.com/f?kw='
for i in range(start, end + 1):
    # Page i maps to query offset pn = (i - 1) * 50 (50 threads per page).
    num = (i - 1) * 50
    # URL-encode the topic name so non-ASCII (e.g. Chinese) is valid in the URL.
    url = base_url + parse.quote(name) + '&ie=utf-8&pn=' + str(num)
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    html = response.read().decode('utf-8')
    file_name = '第' + str(i) + '页内容.html'
    with open(file_name, 'w', encoding='utf-8') as file_obj:
        print('正在爬取第%d页' % i)
        file_obj.write(html)

2.函数式写法
from urllib import parse
from urllib import request

# Fetch data
def read_url(url):
    """Fetch *url* over HTTP and return the response body decoded as UTF-8.

    Sends a desktop-browser User-Agent header, since the site serves
    degraded content to the default urllib agent.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 "
                      "(KHTML, like Gecko)"
                      " Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
    }
    req = request.Request(url, headers=headers)
    response = request.urlopen(req)
    html = response.read().decode('utf-8')
    return html

# Write data to disk
def write_page(file_name, html):
    """Write *html* to *file_name* as UTF-8 text and print a success notice."""
    with open(file_name, 'w', encoding='utf-8') as file_obj:
        file_obj.write(html)
        print("写入成功")

# Main function: everything else is driven from here.
def main():
    """Prompt for a topic and page range, then fetch and save each page.

    Delegates network access to read_url() and disk output to write_page().
    """
    name = input('选择您要查看的主题:')
    start = int(input('选择起始页:'))
    end = int(input('选择结束页:'))
    base_url = 'https://tieba.baidu.com/f?kw='
    for i in range(start, end + 1):
        # Page i maps to query offset pn = (i - 1) * 50 (50 threads per page).
        num = (i - 1) * 50
        url = base_url + parse.quote(name) + '&ie=utf-8&pn=' + str(num)
        file_name = '第' + str(i) + '页内容.html'
        html = read_url(url)
        write_page(file_name, html)


if __name__ == '__main__':
    main()

3.面向对象写法
from urllib import parse
from urllib import request

class BaiduSpider:
    """Object-oriented Baidu Tieba crawler.

    Fetches the HTML source of a user-chosen topic's pages and saves
    each page to a local file.
    """

    def __init__(self):
        # Desktop-browser User-Agent so the server returns the normal
        # page markup instead of blocking the default urllib agent.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/90.0.4430.85 Safari/537.36 Edg/90.0.818.46"
        }
        self.base_url = 'https://tieba.baidu.com/f?kw='

    def read_page(self, url):
        """Fetch *url* and return the response body decoded as UTF-8."""
        req = request.Request(url, headers=self.headers)
        response = request.urlopen(req)
        html = response.read().decode('utf-8')
        return html

    def write_page(self, file_name, html):
        """Write *html* to *file_name* as UTF-8 and print a success notice."""
        with open(file_name, 'w', encoding='utf-8') as file_obj:
            file_obj.write(html)
            print("写入成功")

    def main(self):
        """Prompt for a topic and page range, then fetch and save each page."""
        name = input('选择您要查看的主题:')
        start = int(input('选择起始页:'))
        end = int(input('选择结束页:'))
        for i in range(start, end + 1):
            # Page i maps to query offset pn = (i - 1) * 50.
            num = (i - 1) * 50
            url = self.base_url + parse.quote(name) + '&ie=utf-8&pn=' + str(num)
            file_name = '第' + str(i) + '页内容.html'
            html = self.read_page(url)
            self.write_page(file_name, html)

if __name__ == '__main__':
    # Instantiate the spider and run its interactive entry point.
    yes = BaiduSpider()
    yes.main()