Crawl4AI 智能体网络自动采集利器

Crawl4AI是一款免费的开源工具，利用AI技术简化网络爬取和数据提取，提高信息收集与分析的效率。它能智能识别网页内容，并将数据转换为易于处理的格式，功能全面且操作简便。

1 使用 Crawl4AI 的步骤

步骤 1：安装与设置


pip install "crawl4ai @ git+https://github.com/unclecode/crawl4ai.git" transformers torch nltk

步骤 2：数据提取

创建Python脚本，启动网络爬虫并从URL提取数据：


from crawl4ai import WebCrawler

# Create a WebCrawler instance.
crawler = WebCrawler()

# Warm up the crawler (loads the required models).
crawler.warmup()

# Run the crawler against a URL.
result = crawler.run(url="https://openai.com/api/pricing/")

# Print the extracted content.
print(result.markdown)

步骤 3：数据结构化

使用LLM（大型语言模型）定义提取策略，将数据转换为结构化格式：


import os

from crawl4ai import WebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field


# Record describing the pricing of one model. Its JSON schema is passed to the
# LLM extraction strategy below. Documented with a comment rather than a
# docstring so the generated schema stays exactly as before.
class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="OpenAI 模型的名称。")
    input_fee: str = Field(..., description="OpenAI 模型的输入令牌费用。")
    output_fee: str = Field(..., description="OpenAI 模型的输出令牌费用。")


url = 'https://openai.com/api/pricing/'

crawler = WebCrawler()
crawler.warmup()

# Crawl the pricing page and let the LLM map its content onto the schema.
# The API key is read from the OPENAI_API_KEY environment variable.
result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv('OPENAI_API_KEY'),
        schema=OpenAIModelFee.schema(),
        extraction_type="schema",
        # Adjacent literals concatenate to the same single-line prompt text.
        instruction=(
            "从爬取的内容中提取所有提到的模型名称以及它们的输入和输出令牌费用。"
            "不要遗漏整个内容中的任何模型。"
            "提取的模型 JSON 格式应该像这样："
            '{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", '
            '"output_fee": "US$30.00 / 1M tokens"}.'
        ),
    ),
    bypass_cache=True,
)

print(result.extracted_content)

步骤 4：集成AI智能体

将 Crawl4AI 与 Praison CrewAI 智能体集成，实现高效的数据处理：


pip install praisonai

创建工具文件（tools.py）来包装 Crawl4AI 工具：


# tools.py
import os

from crawl4ai import WebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
from praisonai_tools import BaseTool


# Record describing the pricing of one model. Its JSON schema is passed to the
# LLM extraction strategy. Documented with a comment rather than a docstring
# so the generated schema stays exactly as before.
class ModelFee(BaseModel):
    llm_model_name: str = Field(..., description="模型的名称。")
    input_fee: str = Field(..., description="模型的输入令牌费用。")
    output_fee: str = Field(..., description="模型的输出令牌费用。")


# Tool wrapper that lets an agent extract model fee information from a
# pricing page by crawling it and running LLM-based schema extraction.
class ModelFeeTool(BaseTool):
    name: str = "ModelFeeTool"
    description: str = "从给定的定价页面提取模型的费用信息。"

    def _run(self, url: str):
        """Crawl *url* and return the extracted content of the crawl result.

        The API key is read from the OPENAI_API_KEY environment variable.
        """
        crawler = WebCrawler()
        crawler.warmup()
        result = crawler.run(
            url=url,
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider="openai/gpt-4o",
                api_token=os.getenv('OPENAI_API_KEY'),
                schema=ModelFee.schema(),
                extraction_type="schema",
                # The example key is llm_model_name so that the prompt agrees
                # with the ModelFee schema field of the same name.
                instruction=(
                    "从爬取的内容中提取所有提到的模型名称以及它们的输入和输出令牌费用。"
                    "不要遗漏整个内容中的任何模型。"
                    "提取的模型 JSON 格式应该像这样："
                    '{"llm_model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", '
                    '"output_fee": "US$30.00 / 1M tokens"}.'
                ),
            ),
            bypass_cache=True,
        )
        return result.extracted_content


if __name__ == "__main__":
    # Quick manual test of ModelFeeTool.
    tool = ModelFeeTool()
    url = "https://www.openai.com/pricing"
    result = tool.run(url)
    print(result)

AI智能体配置

配置AI智能体使用Crawl4AI工具进行网络抓取和数据提取。在CrewAI框架下，我们设定了三个核心角色（网络抓取、数据清洗、数据分析），共同完成网站模型定价信息的提取任务。

整个流程无需额外依赖，各角色独立完成各自任务。

2 AI 智能体应用实例

以Crawl4AI为基础，PraisonAI智能体能够执行网络抓取、数据清洗和分析工作。它们相互协作，从多个网站抓取定价数据，并汇总成详尽的报告，以展示分析结果。

3 结语

Crawl4AI是一个强大的工具，能让AI智能体以更高的效率和准确性执行网络爬取和数据提取任务。其开源特性、AI驱动的能力和多功能性，使其成为构建智能且数据驱动的智能体的宝贵资产。

本文转载自，作者：

文章版权声明
1、本网站名称：朵贝贝家具网
2、本站永久网址：http://www.dbbjjxs.com
3、本网站的文章部分内容可能来源于网络，仅供大家学习与参考，如有侵权，请联系站长进行删除处理。
4、本站一切资源不代表本站立场，并不代表本站赞同其观点和对其真实性负责。
5、本站一律禁止以任何方式发布或转载任何违法的相关信息，访客发现请向站长举报。

#开源大模型 #文心一言 #紫东太初 #OpenAI #悟道 #AI #人工智能 #通义千问 #清言 #智能体 #AIGC应用 #日日新 #ChatGPT #网络 #LaMDA #4 #混元 #盘古 #Agent #大模型 #言犀 #Sora #多模态 #云雀 #Crawl4AI #Copilot #孟子 #GPT #Bard #AIGC