https://python.langchain.com/v0.2/docs/integrations/document_loaders/
%pip install --user -Uq langchain langchain_community pypdf pdf2image docx2txt pdfminer
WebBaseLoader
https://python.langchain.com/v0.2/docs/integrations/document_loaders/web_base/
%pip install --user -Uq beautifulsoup4

from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://www.thisisgame.com/webzine/news/nboard/4/?n=189952")
data = loader.load()

print(data[0].page_content)
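load() returns a list of Document objects, so each page carries metadata alongside its text. As a quick check (a sketch; the exact keys, such as title and language, depend on what the page exposes):

# Inspect the metadata WebBaseLoader attached to the scraped page.
print(data[0].metadata)
# e.g. {'source': 'https://www.thisisgame.com/...', 'title': '...', 'language': '...'}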
Multiple URLs can be passed as a list:

loader = WebBaseLoader(["https://www.espn.com/", "https://google.com"])
docs = loader.load()
docs
Load multiple URLs concurrently
%pip install --user -Uq nest_asyncio

# fixes a bug with asyncio and jupyter
import nest_asyncio

nest_asyncio.apply()
loader = WebBaseLoader(["https://www.espn.com/", "https://google.com"])
loader.requests_per_second = 1
docs = loader.aload()
docs
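aload() fetches the pages concurrently (throttled by requests_per_second) and still returns one Document per URL. A quick sanity check on what came back (a sketch):

# List each document's source URL and the amount of text scraped.
for doc in docs:
    print(doc.metadata["source"], "->", len(doc.page_content), "characters")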
XML parser
loader = WebBaseLoader(
    "https://www.govinfo.gov/content/pkg/CFR-2018-title10-vol3/xml/CFR-2018-title10-vol3-sec431-86.xml"
)
loader.default_parser = "xml"
docs = loader.load()
docs
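The XML parser still yields ordinary Document objects, so the extracted regulation text can be inspected the same way (a sketch):

# Print just the beginning of the extracted text.
print(docs[0].page_content[:500])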
SitemapLoader
%pip install --user -Uq nest_asyncio

# fixes a bug with asyncio and jupyter
import nest_asyncio

nest_asyncio.apply()
from langchain_community.document_loaders.sitemap import SitemapLoader

sitemap_loader = SitemapLoader(web_path="https://api.python.langchain.com/sitemap.xml")
docs = sitemap_loader.load()
# Filtering sitemap URLs
loader = SitemapLoader(
    web_path="https://api.python.langchain.com/sitemap.xml",
    filter_urls=["https://api.python.langchain.com/en/latest"],
)
documents = loader.load()
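The filter_urls entries are treated as regular expression patterns rather than plain prefixes, so a pattern can pin down exactly which pages load. A sketch with an explicit (illustrative) regex:

# Keep only pages under /en/latest/, written as a regex with escaped dots.
loader = SitemapLoader(
    web_path="https://api.python.langchain.com/sitemap.xml",
    filter_urls=[r"https://api\.python\.langchain\.com/en/latest/.*"],
)
documents = loader.load()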
The same approach works against an external site's sitemap:

sitemap_loader = SitemapLoader(
    web_path="https://beta.thisisgame.com/sitemap.xml",
    filter_urls=["https://beta.thisisgame.com/articles/265823"],
)
sitemap_loader.requests_per_second = 2
docs = sitemap_loader.load()
docs
Add custom scraping rules
%pip install --user -Uq beautifulsoup4
from bs4 import BeautifulSoup


def remove_nav_and_header_elements(content: BeautifulSoup) -> str:
    # Find all 'nav' and 'header' elements in the BeautifulSoup object
    nav_elements = content.find_all("nav")
    header_elements = content.find_all("header")

    # Remove each 'nav' and 'header' element from the BeautifulSoup object
    for element in nav_elements + header_elements:
        element.decompose()

    return str(content.get_text())
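Before handing the function to SitemapLoader, it can be sanity-checked on a small hand-made page (a self-contained sketch; the HTML snippet is made up for the test):

# A tiny page with nav/header chrome around the real content.
html = "<html><nav>menu</nav><header>site header</header><p>Body text.</p></html>"
print(remove_nav_and_header_elements(BeautifulSoup(html, "html.parser")))
# Only "Body text." survives; the nav and header text is gone.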
loader = SitemapLoader(
    "https://api.python.langchain.com/sitemap.xml",
    filter_urls=["https://api.python.langchain.com/en/latest/"],
    parsing_function=remove_nav_and_header_elements,
)

docs = loader.load()
docs
A parsing function can also clean up the extracted text itself, for example collapsing newlines and non-breaking spaces:

from bs4 import BeautifulSoup


def parse_page(soup):
    # Drop the page header and footer before extracting text.
    header = soup.find("header")
    footer = soup.find("footer")
    if header:
        header.decompose()
    if footer:
        footer.decompose()
    return (
        str(soup.get_text())
        .replace("\n", " ")
        .replace("\xa0", " ")
        # The original chained one more .replace() here to strip the site's
        # recurring menu/banner text (Korean labels such as "기획/특집",
        # "연재", "카툰", "갤러리", "커뮤니티", "로그인"); that string literal was
        # corrupted by an encoding error, so copy the exact banner text from
        # the live page if you need that step.
    )
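parse_page plugs into SitemapLoader the same way as before, via parsing_function (a sketch reusing the article URL from the earlier example):

sitemap_loader = SitemapLoader(
    web_path="https://beta.thisisgame.com/sitemap.xml",
    filter_urls=["https://beta.thisisgame.com/articles/265823"],
    parsing_function=parse_page,
)
sitemap_loader.requests_per_second = 2
docs = sitemap_loader.load()
docs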
PDF Document Loader
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("data/sample.pdf")
pages = loader.load_and_split()

print(pages[0].page_content)
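load_and_split() loads the PDF page by page and then runs a text splitter over the result, so long pages may become several chunks. Each chunk keeps the source path and page number in its metadata, which is useful for citing where an answer came from (a sketch):

# How many chunks were produced, and where the first one came from.
print(len(pages))
print(pages[0].metadata)  # e.g. {'source': 'data/sample.pdf', 'page': 0}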
MS Word Document Loader
https://python.langchain.com/v0.2/docs/integrations/document_loaders/microsoft_word/
API Reference: Docx2txtLoader (https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.word_document.Docx2txtLoader.html)
%pip install --user -Uq docx2txt

from langchain_community.document_loaders import Docx2txtLoader

loader = Docx2txtLoader("data/family.docx")
data = loader.load()
data
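Docx2txtLoader extracts the whole .docx file as plain text into a single Document. A quick look at the result (a sketch):

print(len(data))                   # one Document for the whole file
print(data[0].metadata)            # e.g. {'source': 'data/family.docx'}
print(data[0].page_content[:300])  # first part of the extracted text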