๊ฐ์
๋๊ฐ์ ๋จ๊ณ๋ก ๋๋ ์ง๋ค.
์ ์ฒ๋ฆฌ
์ ์ฒ๋ฆฌ๋ฅผ ํด์ ๊ฐ์ง๊ณ ์๋ ๋ชจ๋ ๋ฌธ์๋ฅผ embedํด์ ๋๋น์ ๋ฃ์ด๋๋ค.
์๋น์ค
์ ๋ฆฌ
Document Loader
https://python.langchain.com/v0.2/docs/integrations/document_loaders/
๋ง์๊ฒ์ค์ ์์์ ๊ณจ๋ผ์ฐ๋ฉด ๋๋ค.
https://python.langchain.com/v0.2/docs/integrations/document_loaders/unstructured_file/
์ผ๋จ ๋ ์ด๊ฑธ๋ก ์ ํ
Copy % pip install --user - Uq unstructured
# Load environment variables (e.g. OPENAI_API_KEY) from .env, then read the
# Markdown guide into LangChain Document objects.
from dotenv import load_dotenv

load_dotenv()

from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import UnstructuredMarkdownLoader

loader = UnstructuredMarkdownLoader("files/kubernetes_hardening_guidance.md")
docs = loader.load()
docs
Document Splitter
๋ํ๋จผํธ 1๊ฐ๋ก ๋ก๋๊ฐ ๋๋ค. ์ด ๋ฌธ์๋ฅผ ๋ค llm์ผ๋ก ๋ณด๋ด๋ ๋๋ ๋น์ฉ๋ฌธ์ ๊ฐ ๋ฐ์ํ๋ค.
๊ทธ๋ฆฌ๊ณ llm์์ ์ต๋ ํฌ๊ธฐ๊ฐ ์์ด์ ์ ๋ถ๋ค ํ๊บผ๋ฒ์ ๋ณด๋ด์ง ๋ชปํ๋ค.
ํ์์๋๋ถ๋ถ๊น์ง ๋ณด๋ด์ ๋น์ฉ์ ๋ด๋๊ฒ๋ณด๋ค ์งค๋ผ์ ํ์ํ ๋ถ๋ถ๋ง ๋ณด๋ด๋ฉด ๋ ์ข์๊ฑฐ๊ฐ๋ค.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the single loaded document using the splitter's default settings.
splitter = RecursiveCharacterTextSplitter()
docs = loader.load_and_split(text_splitter=splitter)
docs
14๊ฐ๋ก ์งค๋ผ์ง๊ฑธ ๋ณผ์ ์๋ค.
๋ ๋ง์ด ์๋ผ๋ณด์.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Much smaller chunks (200 characters each) to see how chunk count grows.
splitter = RecursiveCharacterTextSplitter(chunk_size=200)
docs = loader.load_and_split(text_splitter=splitter)
docs
14๊ฐ๊ฐ 2482๋ก ๋์ด๋ฌ๋ค. ๊ทผ๋ฐ ๋ฌธ์ฅ์ด ์งค๋ฆผ.. llm์ด ์ดํด๋ฅผ ๋ชปํจ.
overlap์ ์ฌ์ฉํ์.
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Overlap adjacent chunks by 50 characters so a sentence cut at a chunk
# boundary still appears whole in one of the neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
    separators=["\n", "\r\n"],
    length_function=len,  # chunk size measured in characters
)
docs = loader.load_and_split(text_splitter=splitter)
docs
openai๋ length๋ฅผ ์ฌ์ฉํ์ง ์๊ณ token์ ์ฌ์ฉํ๋ค.
https://platform.openai.com/tokenizer
ํ ํฐ๊ณผ character๊ฐ ๋ค๋ฅธ๊ฑธ ์์์๋ค.
์ฐ๋ฆฌ๋ ๋ฌธ์๋ฅผ token๋จ์๋ก ์ดํดํด์ผํ ํ์๊ฐ ์๋ค.
๋งํฌ์์ tiktoken ํจํค์ง๋ฅผ ์ถ์ฒํ๋ค.
https://github.com/openai/tiktoken
์ฌ์ฉํ์.
Copy % pip install --user - Uq tiktoken
from langchain.text_splitter import CharacterTextSplitter

# Size chunks in OpenAI tokens (via tiktoken) rather than characters,
# matching how the model itself counts input length.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=200,
    chunk_overlap=50,
    separator="\n",
)
docs = loader.load_and_split(text_splitter=splitter)
len(docs)
๋ฌธ์๊ฐ ์ ๋๋ ์ก๋ค. ์ด์ ์๋ฒ ๋ฉ์ ํด์ ๋๋น์ ๋ฃ์ด๋ณด์.
Embedding ํ๊ธฐ
๋ฌธ์๋ฅผ ์ซ์๋ก ๋ฐ๊ฟ์ ๋๋น์ ๋ฃ์์ค๋น๋ฅผ ํ๋๊ฒ์ ์ด ์ซ์๋ฅผ ๊ธฐ์ค์ผ๋ก ์ ์ฌ๋๋ฅผ ๊ฒ์ํ๋ค.
์ฃผ์์ฌํญ์ ์๋ฒ ๋ฉ ๋ฐฉ์์ ์ง๊ธ ๊ฒฐ์ ํ๋ฉด ๋์ค์ query๋ฅผ ํ ๋๋ ๊ฐ์ ๋ฐฉ์์ผ๋ก ํด์ผํ๋ค.
openai embedding์ ์ด์ฉํฉ๋๋ค. ๋ค๋ฅธ๊ฒ๋ค๋ ์์ผ๋ ์ฐพ์์ ๋ณด์์
https://github.com/turbomaze/word2vecjson ์ ์ฐธ๊ณ ํ์์
๋ ์ผ๋จ ๋ชฐ๋ผ๋ ๋๋๊ฑฐ๊ฐ์์ ๋ฐ๋ก ์ฝ๋๋ก ์งํ
https://python.langchain.com/v0.2/docs/integrations/text_embedding/openai/
OpenAI Embedding์ ์ฌ์ฉํ์.
from langchain_openai import OpenAIEmbeddings

# Embed a sample text as a query, then as a document.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

text = "This is a test document."
vector = embeddings.embed_query(text)
vector

doc_result = embeddings.embed_documents([text])
doc_result[0][:5]  # first five dimensions of the document vector

print(len(doc_result))
len(doc_result[0])

# The text-embedding-3-* models accept a reduced output dimensionality.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", dimensions=1024)

len(embeddings.embed_documents([text])[0])
Embedding์ ๋๋น(FAISS) ์ ๋ฃ๊ธฐ
์๋ฒ ๋ฉ์ ํด์ FAISS ์ ์
๋ ฅํ๊ธฐ
Copy % pip install --user - Uq faiss - cpu
# %pip install --user -Uq faiss-gpu
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import UnstructuredMarkdownLoader

# Token-based splitting with larger chunks for the real indexing pipeline.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

loader = UnstructuredMarkdownLoader("files/kubernetes_hardening_guidance.md")
docs = loader.load_and_split(text_splitter=splitter)
from langchain.vectorstores import FAISS

embeddings = OpenAIEmbeddings()

# Embed every chunk and index the vectors in FAISS.
vectorstore = FAISS.from_documents(docs, embeddings)

# Similarity search returns the chunks closest to the query embedding.
results = vectorstore.similarity_search("๋๊ตฌ์๊ฒ ์ด ๊ฐ์ด๋๋ฅผ ์ถ์ฒํ๋๊ฐ?")
results
์ ์ฌ๋ ๊ฒ์ฌ๋ฅผ ํด์ ๋ฌธ์ฅ์ ์ ์ฐพ์์จ๋ค. ์ด๊ฑธ ํ๋กฌํํธ๋ก llm์ ๋ณด๋ด๋ฉด ๋๋ค.
๋ณดํต 3๊ฐ๋ 5๊ฐ๊ฐ ์ข๋ค๊ณ ํ๋ค.
Embedding cache
ํ ๋๋ง๋ค ๋น์ฉ์ด ๋ฐ์ํ๋ค.
๋๋น์ ์
๋ ฅํ ๊ฒ์ ๋ก์ปฌ ๋๋ผ์ด๋ธ์ cache๋ฅผ ํ์. ๊ทธ๋ฌ๋ฉด openai์ ๋ค์ ์์ฒญํ์ง ์์์ ์๊ธ์ด ์ค์ด๋ ๋ค.
# Cache embedding vectors on local disk: re-embedding identical chunks is
# served from the cache instead of hitting the OpenAI API again (cost saving).
from langchain.embeddings import CacheBackedEmbeddings

embeddings = OpenAIEmbeddings()
cache_dir = LocalFileStore("./cache/")
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# vectorstore = FAISS.from_documents(docs, embeddings)
vectorstore = FAISS.from_documents(docs, cached_embeddings)

# Search again — embeddings now come from the local cache when present.
results = vectorstore.similarity_search("๋๊ตฌ์๊ฒ ์ด ๊ฐ์ด๋๋ฅผ ์ถ์ฒํ๋๊ฐ?")
results
์ด์ ์ด ๊ฒฐ๊ณผ๋ฅผ llm์ ๋ณด๋ผ๊ฐ?
๊ฒ์๋ ๊ฒฐ๊ณผ๋ฅผ prompt์ ๋ฃ๊ณ llm์ ๋ณด๋ด๊ธฐ
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# NOTE(review): "retriver" is a typo, but later cells reference this exact
# name, so it is kept as-is here.
retriver = vectorstore.as_retriever()

# "Stuff" strategy: put all retrieved context directly into the system prompt.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
You are a helpful AI talking to a human, Answer questions using only the following context.
If you don't know the answer just say you don't know, don't make it up:
{context}
""",
        ),
        ("human", " {question} "),
    ]
)

llm = ChatOpenAI(temperature=0.1)

from langchain.schema.runnable import RunnablePassthrough

# LCEL chain: the retriever fills {context}; the raw input string passes
# through unchanged to become {question}.
chain = (
    {
        "context": retriver,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
)

chain.invoke("๋๊ตฌ์๊ฒ ์ด ๊ฐ์ด๋๋ฅผ ์ถ์ฒํ๋๊ฐ?")
DocumentsChain (๋ฌธ์๋ฅผ vector์์ ๊ฐ์ ธ์์ ์ฒ๋ฆฌํ๋ ๋ฐฉ์)
stuff - vector ๊ฒ์์ ๊ฑธ๋ฆฐ ๋ชจ๋ ๋ฌธ์๋ฅผ prompt์ ๋ฃ์ด์ ๋ณด๋ธ๋ค.
๊ทธ๋ฆผ์ผ๋ก ์ค๋ช
stuff ๊ฒ์ ๊ฒฐ๊ณผ ๋ฌธ์๋ฅผ prompt์ ๋ฃ์ด์ ๋ณด๋ด์.
stuff๋ผ๊ณ ๋ถ๋ฅด๋๋ฐ .
์ด๊ฑด ์ ์ํ์์ ์ฒ๋ฆฌํ๋ค. https://python.langchain.com/v0.2/docs/tutorials/summarization/#stuff
๋ชจ๋ ๋ฌธ์๋ฅผ ๋จ์ผ ํ๋กฌํํธ์ "์ฑ์ฐ๊ธฐ"๋ง ํ๋ฉด ๋ฉ๋๋ค. ์ด๊ฒ์ด ๊ฐ์ฅ ๊ฐ๋จํ ์ ๊ทผ ๋ฐฉ์์
๋๋ค
We can use chain_type="stuff", especially if using larger context window models such as:
128k token OpenAI gpt-4-turbo-2024-04-09 200k token Anthropic claude-3-sonnet-20240229
beautifulsoup4 ํจํค์ง๋ก ์น์ ํฌ๋กค๋ง ํด์ค๊ณ ๊ทธ ๋ฐ์ดํฐ๋ฅผ llm์ ๊ฐ์ด ๋ฃ์ด์ ์์ฝ์ ๋ฐ์.
ํ์ผ์ด ํฌ๋๊น. ํฐ๋ชจ๋ธ๋ก ํ์.
map reduce
"๋งต" ๋จ๊ณ์์ ๊ฐ ๋ฌธ์๋ฅผ ์์ฒด์ ์ผ๋ก ์์ฝํ ๋ค์ ์์ฝ์ ์ต์ข
์์ฝ์ผ๋ก "์ถ์"ํฉ๋๋ค
https://python.langchain.com/v0.2/docs/tutorials/summarization/#map-reduce
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.vectorstores import FAISS

# Rebuild the vector store with disk-cached embeddings and expose it as a
# retriever for use inside the map-reduce chain below.
embeddings = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)
vectorstore = FAISS.from_documents(docs, cached_embeddings)
retriever = vectorstore.as_retriever()
vector store์์ ๊ฒ์์ ํ๋๋ก ์ฒด์ธ์ ๊ตฌ์ฑ
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# "Map" step prompt: ask the LLM to extract, verbatim, whatever part of a
# single chunk is relevant to the question.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
-------
{context}
""",
        ),
        ("human", " {question} "),
    ]
)

map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    """Run the map prompt over every retrieved document and join the extracts."""
    retrieved = inputs["documents"]
    question = inputs["question"]
    extracts = [
        map_doc_chain.invoke(
            {"context": document.page_content, "question": question}
        ).content
        for document in retrieved
    ]
    return "\n\n".join(extracts)


map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)
๊ฒฐ๊ณผ ๋์จ ๋ด์ฉ์ ๋ค์ ๋ง์ง๋ง prompt์ ๋ฃ์ด์ ์ต์ข
์ฟผ๋ฆฌ llm์ ๋ณด๋ด๊ธฐ
# "Reduce" step: combine the per-chunk extracts into one final answer.
final_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
Given the following extracted parts of a long document and a question, create a final answer.
If you don't know the answer, just say that you don't know. Don't try to make up an answer.
------
{context}
""",
        ),
        ("human", " {question} "),
    ]
)

chain = (
    {
        "context": map_chain,
        "question": RunnablePassthrough(),
    }
    | final_prompt
    | llm
)
์์ฒญํด๋ณด๊ธฐ
# Ask the map-reduce chain to summarize the document.
chain.invoke("์์ฝํด์ค")
Refine
์ด๊ฑด ์ง์ ํด๋ณด์์
๋ฌธ์๋ฅผ ์์ฐจ์ ์ผ๋ก ์์ฝํ๋๋ฐ ๊ธฐ์กด์ ์์ฝ๋ ๊ฒ๊ณผ ํจ๊ป ๋ค์ ์์ฝ์ ํ๋ ๋ฐฉ์ ์ผ๋ก ์
๋ฐ์ดํธํฉ๋๋ค.