这篇文章主要介绍了基于 XPath 选择器、PyQuery 和正则表达式的格式清理工具。本文介绍得非常详细,对大家的学习或工作具有一定的参考借鉴价值,需要的朋友可以参考一下。
1. 使用 XPath 清理不必要的标签元素以及无内容的标签
from lxml import etree
def xpath_clean(self, text: str, xpath_dict: dict) -> str:
xpath 清除不必要的元素
:param text: html_content
:param xpath_dict: 清除目标xpath
:return: string type html_content
remove_by_xpath = xpath_dict if xpath_dict else dict()
# 必然清除的项目 除非极端情况 一般这些都是要清除的
remove_by_xpath.update({
_remove_2: //iframe,
_remove_4: //button,
_remove_5: //form,
_remove_6: //input,
_remove_7: //select,
_remove_8: //option,
_remove_9: //textarea,
_remove_10: //figure,
_remove_11: //figcaption,
_remove_12: //frame,
_remove_13: //video,
_remove_14: //script,
_remove_15: //style
})
parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
selector = etree.HTML(text, parser=parser)
# 常规删除操作,不需要的标签删除
for xpath in remove_by_xpath.values():
for bad in selector.xpath(xpath):
bad_string = etree.tostring(bad, encoding=utf-8,
pretty_print=True).decode()
logger.debug(f”clean article content : {bad_string}”)
bad.getparent().remove(bad)
skip_tip = “name()=img or name()=tr or ”
“name()=th or name()=tbody or ”
&