commit acda3bf3344261b39f76c7bfe503e5d0f94ef953
Author: Harsimran
Date:   Fri Feb 28 01:25:10 2025 +0530

    first commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d1935b0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,31 @@
+# Python bytecode files
+*.pyc
+*.pyo
+__pycache__/
+
+# Virtual environment directories
+env/
+venv/
+ENV/
+.venv/
+
+
+# Configuration files
+*.env
+*.env.*
+
+# IDE or editor settings (e.g., VSCode, PyCharm)
+.vscode/
+.idea/
+
+# Python packaging
+*.egg
+*.egg-info/
+dist/
+build/
+*.tar.gz
+
+
+faiss_index/
+
+.env
\ No newline at end of file
diff --git a/Book1.pdf b/Book1.pdf
new file mode 100644
index 0000000..c2dd8b2
Binary files /dev/null and b/Book1.pdf differ
diff --git a/Book2.pdf b/Book2.pdf
new file mode 100644
index 0000000..6a5664f
Binary files /dev/null and b/Book2.pdf differ
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6253d1d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,68 @@
+# Ayurveda Chatbot using LLaMA and RAG
+
+This project is an interactive Ayurveda chatbot built on a **Retrieval-Augmented Generation (RAG)** pipeline powered by a **LLaMA model served through the Groq API**. The chatbot answers user queries with Ayurvedic knowledge retrieved from indexed PDF texts.
+
+---
+
+## Features
+
+- **PDF Knowledge Base**: Ayurvedic texts are indexed for domain-specific answers.
+- **RAG Pipeline**: Combines FAISS vector retrieval with LLaMA for context-aware responses.
+- **Streamlit Interface**: Easy-to-use frontend for interacting with the chatbot.
+
+---
+
+## Requirements
+
+- Python 3.8+
+- GPU support (optional but recommended for faster local embedding)
+- A [Groq](https://groq.com) API key for LLaMA inference
+
+---
+
+## Installation
+
+### 1. Clone the Repository
+```bash
+git clone https://github.com/your-username/ayurveda-chatbot.git
+cd ayurveda-chatbot
+```
+
+### 2. Create and Activate a Virtual Environment
+On Linux/macOS:
+```bash
+python3 -m venv env
+source env/bin/activate
+```
+On Windows:
+```
+python -m venv env
+env\Scripts\activate
+```
+
+### 3. Install Dependencies
+```bash
+pip install -r requirements.txt
+```
+
+### 4. Configure the `.env` File
+Create a `.env` file in the project root containing your Groq API key (`GROQ_API_KEY=...`).
+
+---
+
+## Usage
+
+### 1. Preprocess the PDFs and Create the FAISS Index
+Place the PDF files (e.g., `Book1.pdf`, `Book2.pdf`) in the project directory, then run the backend script to preprocess them and build the FAISS index:
+
+```bash
+python3 backend.py Book1.pdf Book2.pdf --index-path faiss_index
+```
+
+### 2. Start the Chatbot
+Launch the Streamlit interface:
+
+```bash
+streamlit run frontend.py
+```
+Access the chatbot in your browser at http://localhost:8501.
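The README's Usage section gives no quick way to confirm the index was built correctly before launching the UI. A minimal sketch, assuming the index was written to the default `faiss_index` directory — the file name `sanity_check.py` and the sample query are hypothetical, and the embedding wrapper is reused from the `backend.py` introduced just below:

```python
# sanity_check.py -- hypothetical helper, not part of this commit.
# Loads the FAISS index written by backend.py and runs one retrieval query.
from langchain_community.vectorstores import FAISS

from backend import SentenceTransformerEmbeddings

embeddings = SentenceTransformerEmbeddings()
# allow_dangerous_deserialization is required by recent LangChain releases
# because the docstore is pickled; only load indexes you created yourself.
db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

for i, doc in enumerate(db.similarity_search("What are the three doshas?", k=3), start=1):
    print(f"--- chunk {i} ---")
    print(doc.page_content[:200])
```

If the printed chunks look like Ayurvedic text rather than noise, the index is usable and `streamlit run frontend.py` should return grounded answers.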
diff --git a/backend.py b/backend.py
new file mode 100644
index 0000000..05012ba
--- /dev/null
+++ b/backend.py
@@ -0,0 +1,66 @@
+import os
+from langchain_community.document_loaders import PyPDFLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+from sentence_transformers import SentenceTransformer
+from langchain.embeddings.base import Embeddings
+from langchain.docstore.in_memory import InMemoryDocstore
+import faiss
+
+# Define a custom embedding wrapper for LangChain
+class SentenceTransformerEmbeddings(Embeddings):
+    def __init__(self, model_name="all-MiniLM-L6-v2"):
+        self.model = SentenceTransformer(model_name)
+
+    def embed_documents(self, texts):
+        return self.model.encode(texts, show_progress_bar=True)
+
+    def embed_query(self, text):
+        return self.model.encode([text], show_progress_bar=False)[0]
+
+# Function to create an empty FAISS index
+def create_empty_faiss_index(embedding_model):
+    embedding_dimension = embedding_model.model.get_sentence_embedding_dimension()
+    index = faiss.IndexFlatL2(embedding_dimension)  # Initialize FAISS index
+    docstore = InMemoryDocstore({})
+    index_to_docstore_id = {}
+    return FAISS(index=index, docstore=docstore, index_to_docstore_id=index_to_docstore_id, embedding_function=embedding_model)
+
+# Function to update the FAISS index with new books
+def update_faiss_index(book_paths, faiss_index_path="faiss_index"):
+    # Load or initialize the FAISS index (deserializing a saved index requires the opt-in flag)
+    embedding_model = SentenceTransformerEmbeddings()
+    if os.path.exists(faiss_index_path):
+        print("Loading existing FAISS index...")
+        db = FAISS.load_local(faiss_index_path, embedding_model, allow_dangerous_deserialization=True)
+    else:
+        print("Creating a new FAISS index...")
+        db = create_empty_faiss_index(embedding_model)
+
+    # Process each book
+    for book_path in book_paths:
+        print(f"Processing book: {book_path}")
+        loader = PyPDFLoader(book_path)
+        documents = loader.load()
+
+        # Split text into overlapping chunks
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+        chunks = text_splitter.split_documents(documents)
+        texts = [chunk.page_content for chunk in chunks]
+
+        # Embed the chunks and add them to the FAISS index
+        db.add_texts(texts)
+
+    # Save the updated FAISS index
+    db.save_local(faiss_index_path)
+    print(f"FAISS index updated and saved at: {faiss_index_path}")
+
+# Command-line interface
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(description="Update FAISS index with new books")
+    parser.add_argument("books", nargs="+", help="Path(s) to the PDF book(s)")
+    parser.add_argument("--index-path", default="faiss_index", help="Path to FAISS index directory")
+    args = parser.parse_args()
+
+    update_faiss_index(args.books, args.index_path)
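Since `update_faiss_index` is a plain function, the indexing step can also be driven from Python instead of the CLI — a minimal sketch using the two PDFs shipped in this commit:

```python
from backend import update_faiss_index

# Index both bundled books into the default directory; because the function
# loads an existing index when one is present, re-running it with new PDFs
# appends to the index rather than rebuilding it from scratch.
update_faiss_index(["Book1.pdf", "Book2.pdf"], faiss_index_path="faiss_index")
```

Note that re-running it with the *same* PDFs appends duplicate chunks, since nothing deduplicates previously indexed texts.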
diff --git a/frontend.py b/frontend.py
new file mode 100644
index 0000000..6b9b39a
--- /dev/null
+++ b/frontend.py
@@ -0,0 +1,113 @@
+import os
+import streamlit as st
+from dotenv import load_dotenv
+from langchain_community.vectorstores import FAISS
+from sentence_transformers import SentenceTransformer
+from langchain.embeddings.base import Embeddings
+import requests  # To handle HTTP requests for the Groq API
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Set page configuration
+st.set_page_config(page_title="Ayurveda Chatbot", layout="wide")
+
+# Check for the GROQ_API_KEY environment variable
+groq_key = os.getenv("GROQ_API_KEY")
+if not groq_key:
+    st.error("The 'GROQ_API_KEY' environment variable is not set. Please set it in the .env file or the environment.")
+else:
+    st.write("GROQ_API_KEY loaded successfully")
+
+# Define a custom embedding wrapper for LangChain
+class SentenceTransformerEmbeddings(Embeddings):
+    def __init__(self, model_name="all-MiniLM-L6-v2"):
+        self.model = SentenceTransformer(model_name)
+
+    def embed_documents(self, texts):
+        return self.model.encode(texts, show_progress_bar=True)
+
+    def embed_query(self, text):
+        return self.model.encode([text], show_progress_bar=False)[0]
+
+# Path to FAISS index
+faiss_index_path = "faiss_index"
+
+# Load the FAISS index (allow_dangerous_deserialization is required because the pickled docstore must be deserialized)
+embedding_model = SentenceTransformerEmbeddings()
+try:
+    db = FAISS.load_local(faiss_index_path, embedding_model, allow_dangerous_deserialization=True)
+except Exception as e:
+    st.error(f"Failed to load FAISS index: {str(e)}")
+    db = None
+
+# Define the class to handle API calls to Groq
+class GroqAPI:
+    def __init__(self, api_key):
+        self.api_key = api_key
+        self.endpoint = "https://api.groq.com/openai/v1/chat/completions"
+
+    def generate_answer(self, query, context, model="llama-3.3-70b-versatile"):
+        # Prepare the system message, injecting the retrieved context
+        system_message = (
+            "You are an Ayurvedic expert with deep knowledge of Ayurvedic practices, remedies, and diagnostics. "
+            "Use the provided Ayurvedic context to answer the question thoughtfully and accurately.\n\n"
+            f"Context:\n{context}\n\n"
+            f"Question:\n{query}\n\n"
+            "Answer as an Ayurvedic expert:"
+        )
+
+        payload = {
+            "model": model,
+            "messages": [
+                {"role": "system", "content": system_message},
+                {"role": "user", "content": query}
+            ]
+        }
+        headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json"
+        }
+
+        try:
+            response = requests.post(self.endpoint, json=payload, headers=headers, timeout=60)  # timeout avoids hanging the UI on a stalled request
+            if response.status_code == 200:
+                result = response.json()
+                return result["choices"][0]["message"]["content"]
+            else:
+                return f"Error: {response.status_code} - {response.text}"
+        except Exception as e:
+            return f"Error: {str(e)}"
+
+# Initialize the GroqAPI
+groq_api = GroqAPI(api_key=groq_key)
+
+# Custom QA chain function that integrates the FAISS index and the Groq API
+def custom_qa_chain(query):
+    if not db:
+        return "FAISS index is not loaded."
+    try:
+        # Retrieve relevant context from the FAISS index
+        context = db.similarity_search(query, k=3)
+        context_text = "\n".join([doc.page_content for doc in context])
+
+        # Get the response from the Groq API
+        response = groq_api.generate_answer(query, context_text)
+    except Exception as e:
+        response = f"Error during QA chain: {str(e)}"
+
+    return response
+
+# Streamlit UI
+st.title("Ayurveda Chatbot")
+
+st.subheader("Ask your Ayurvedic Question")
+query = st.text_input("Enter your query:")
+if query:
+    with st.spinner("Retrieving answer..."):
+        st.write(f"Processing query: {query}")
+
+        # Get the response from the custom QA chain
+        response = custom_qa_chain(query)
+
+        st.markdown(f"### Answer:\n{response}")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..96188a0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,13 @@
+streamlit==1.41.1
+langchain==0.3.13
+langchain-community==0.3.13
+sentence-transformers==3.3.1
+faiss-cpu==1.9.0.post1
+PyPDF2==3.0.1
+torch==2.5.1
+transformers==4.47.1
+pandas==2.2.3
+numpy==1.26.4
+pypdf==5.1.0
+groq==0.15.0
+python-dotenv
\ No newline at end of file
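One loose end in requirements.txt: `groq==0.15.0` is pinned, but frontend.py calls the endpoint with raw `requests`. If the hand-rolled HTTP handling in `GroqAPI` is ever dropped, the equivalent call through the official client looks roughly like this — a sketch, with `groq_key`, `system_message`, and `query` standing in for the values frontend.py already builds:

```python
from groq import Groq

client = Groq(api_key=groq_key)
completion = client.chat.completions.create(
    model="llama-3.3-70b-versatile",  # same model name used in GroqAPI.generate_answer
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": query},
    ],
)
print(completion.choices[0].message.content)
```

The client raises typed exceptions on non-200 responses, which would replace the manual `status_code` check.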