Finding a site's entry URLs via the Sitemap directive in robots.txt

A site's robots.txt often points to an XML sitemap that enumerates its pages; the script below fetches that Sitemap line and then extracts every entry URL from the sitemap.
# -*- coding:utf-8 -*-
import requests
from lxml import etree


def get_sitemapinfo(robots_url):
    """
    Purpose: extract the Sitemap URL from robots.txt.
    Returns the sitemap URL, e.g. /news/upload/ueditor/image/202208/txnbaw4ndav.xml
    :param robots_url: URL of the site's robots.txt
    :return: the sitemap URL, e.g. /news/upload/ueditor/image/202208/txnbaw4ndav.xml
    """
    response = requests.get(robots_url).text
    if "Sitemap:" in response:
        # take the text after the last "Sitemap:" directive
        return response.split("Sitemap:")[-1].strip()
    print("This site's robots.txt does not declare a Sitemap")
def get_links(sitemap_url, rule):
    """
    Purpose: extract every entry URL listed in the sitemap.
    Returns links, e.g. ['/news/upload/ueditor/image/202208/cqjxahtugtt',
                         '/news/upload/ueditor/image/202208/xkg0czyqoia',
                         '/news/upload/ueditor/image/202208/4kfmkiijyso']
    :param sitemap_url: URL of the sitemap
    :param rule: XPath expression used to match the URLs
    :return: list of entry URLs
    """
    response = requests.get(sitemap_url)
    # lxml's HTML parser is lenient, so it also copes with sitemap XML
    r = etree.HTML(response.text.encode("UTF-8"))
    links = r.xpath(rule)
    return links
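
# Note: standard sitemaps declare the namespace
# http://www.sitemaps.org/schemas/sitemap/0.9, which a plain //url/loc
# query would not match under a strict XML parse; parsing with etree.HTML
# above sidesteps that. A sketch of a namespace-aware alternative
# (get_links_xml is a hypothetical helper of mine, not from the original):
def get_links_xml(sitemap_url):
    """Strict-XML variant of get_links for namespaced sitemaps."""
    response = requests.get(sitemap_url)
    root = etree.XML(response.content)
    ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    return root.xpath("//sm:url/sm:loc/text()", namespaces=ns)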
if __name__ == "__main__":
    # run the pipeline: robots.txt -> sitemap -> entry URLs
    # URL of the target site's robots.txt
    url = "/news/upload/ueditor/image/202208/wpoaqwtdt3k.txt"
    sitemap_url = get_sitemapinfo(robots_url=url)
    links = get_links(sitemap_url=sitemap_url, rule="//url/loc/text()")
    print(f"links:{links}")