In this post, we’ll walk through a working RAG pipeline using climate science data from the IPCC to produce grounded, factual, and transparent responses with an LLM.
What Is RAG?
Retrieval-Augmented Generation (RAG) enhances LLMs by injecting context from a knowledge base at inference time. Unlike a standalone LLM, which may hallucinate or rely on outdated training data, a RAG pipeline offers:
Grounded answers: responses are based on your own data (e.g., reports, PDFs)
Fewer hallucinations: factual context is injected into every prompt
Easy updates: just replace the documents, no retraining needed
A Simple Architecture: How It Works
Step 1: Parsing & Chunking
We use docling's HybridChunker with a Hugging Face tokenizer for clean, token-aware document splitting:
# chunker.py
import os

from docling import chunking
from docling.document_converter import DocumentConverter
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from transformers import AutoTokenizer


class Chunker:
    def __init__(self, embedding_model: str, max_tokens: int = 1024):
        tokenizer = HuggingFaceTokenizer(
            tokenizer=AutoTokenizer.from_pretrained(embedding_model),
            max_tokens=max_tokens,
        )
        self.__chunker = chunking.HybridChunker(tokenizer=tokenizer, merge_peers=True)

    def chunk(self, source: str):
        doc = DocumentConverter().convert(source=source).document
        chunk_iter = self.__chunker.chunk(dl_doc=doc)
        chunks = list(chunk_iter)
        chunks_dicts = []
        for chunk in chunks:
            chunks_dicts.append(
                {
                    "content": chunk.text,
                    "page_number": chunk.meta.doc_items[0].prov[0].page_no,
                    "pdf_name": os.path.basename(source),
                }
            )
        return chunks_dicts
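Step 2: Embeddings
The pipeline embeds every chunk with BAAI/bge-m3 through a small CustomEmbeddings wrapper (imported in rag_main.py below but not reproduced in this post). A minimal sketch, assuming the model is loaded via the sentence-transformers library, might look like this:
# custom_embedding.py (illustrative sketch; the original wrapper is not shown)
from sentence_transformers import SentenceTransformer


class CustomEmbeddings:
    def __init__(self, model_name: str):
        # BAAI/bge-m3 produces 1024-dimensional dense vectors, matching the LanceDB schema below.
        self.__model = SentenceTransformer(model_name)

    def embed_query(self, text: str) -> list[float]:
        # Encode a single string and return a plain Python list so it can be stored
        # in a DataFrame column and written to LanceDB.
        return self.__model.encode(text, normalize_embeddings=True).tolist()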
Step 3: Vector Storage
# lancedb.py
import lancedb
import pyarrow as pa
from lancedb.table import Table
from pandas import DataFrame


class LanceDB:
    def __init__(self,
                 vector_storage_path: str = "./lancedb/vector_storage",
                 table_name: str = "knowledge_base"):
        db = lancedb.connect(uri=vector_storage_path)
        # Schema mirrors the chunker output plus a 1024-dim embedding column (BAAI/bge-m3).
        schema = pa.schema([
            pa.field("content", pa.string()),
            pa.field("page_number", pa.int32()),
            pa.field("pdf_name", pa.string()),
            pa.field("embeddings", pa.list_(pa.float32(), 1024)),
        ])
        try:
            db.create_table(table_name, schema=schema)
            print(f"Table {table_name} created successfully.")
        except Exception as e:
            print(f"Table {table_name} already exists: {e}")
        self.__table: Table = db.open_table(name=table_name)

    def semantic_search(self, vector_query: list[float], n: int = 10, distance_threshold=0.50) -> DataFrame:
        search_results = (
            self.__table.search(vector_query, vector_column_name="embeddings")
            .distance_type("cosine")
            .limit(n)
            .to_pandas()
        )
        print(f"search_results\n\n {search_results}")
        # Keep only hits within the cosine-distance threshold.
        return search_results.loc[search_results["_distance"] <= distance_threshold]

    def get_count(self) -> int:
        return self.__table.count_rows()

    def save(self, df: DataFrame):
        self.__table.add(df)
        print(f"total records in lancedb: {self.__table.count_rows()}")

    def create_index(self):
        try:
            self.__table.create_index(metric="cosine", vector_column_name="embeddings")
        except Exception as e:
            print(f"Index may already exist: {e}")
Step 4: Prompt Template
# prompt_template.py
class PromptTemplate:
    @staticmethod
    def build(context: str, question: str, max_token: int = 512) -> str:
        prompt = f"""You are a Climate Science Assistant using IPCC research to explain climate change clearly and compassionately.

**Your Approach:**
- Use solid IPCC scientific evidence
- Explain concepts accessibly for all audiences
- Be honest about uncertainties while providing clear guidance
- Support responses with specific data and findings
- Remain helpful, accurate, and encouraging
- **Keep responses under {max_token} tokens**

**Available Scientific Context (IPCC 2023 Synthesis Report):**
{context}

**Question:**
{question}

**Your Response (max {max_token} tokens):**
"""
        return prompt
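rag_main.py (next) also imports a QwenLLM wrapper from src/llm/qwen_llm.py that isn't reproduced here. A minimal sketch, assuming Hugging Face transformers and Qwen3's enable_thinking chat-template flag, might look like this:
# qwen_llm.py (illustrative sketch; the original wrapper is not shown in this post)
from transformers import AutoModelForCausalLM, AutoTokenizer


class QwenLLM:
    def __init__(self, model_name: str):
        self.__tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.__model = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype="auto", device_map="auto"
        )

    def invoke(self, prompt: str, enable_thinking: bool = False, return_thinking: bool = False) -> dict:
        # Qwen3 chat templates accept an enable_thinking flag that toggles the
        # model's <think>...</think> reasoning block.
        messages = [{"role": "user", "content": prompt}]
        text = self.__tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking
        )
        inputs = self.__tokenizer([text], return_tensors="pt").to(self.__model.device)
        generated = self.__model.generate(**inputs, max_new_tokens=1024)
        output_ids = generated[0][len(inputs.input_ids[0]):].tolist()
        try:
            # 151668 is the </think> token id in Qwen3 tokenizers (per the Qwen3 model card).
            split = len(output_ids) - output_ids[::-1].index(151668)
        except ValueError:
            split = 0
        thinking = self.__tokenizer.decode(output_ids[:split], skip_special_tokens=True).strip()
        response = self.__tokenizer.decode(output_ids[split:], skip_special_tokens=True).strip()
        result = {"response": response}
        if return_thinking:
            result["thinking"] = thinking
        return result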
# rag_main.py
import pandas as pd
from src.chunker.chunker import Chunker
from src.embedding.custom_embedding import CustomEmbeddings
from src.llm.qwen_llm import QwenLLM
from src.prompt.prompt_template import PromptTemplate
from src.storage.lancedb import LanceDB
pdf_data = "https://www.ipcc.ch/report/ar6/syr/downloads/report/IPCC_AR6_SYR_LongerReport.pdf"
EMBEDDING_MODEL = "BAAI/bge-m3"
LLM_MODEL = "Qwen/Qwen3-1.7B"
# initialize the embedding model
embeddings = CustomEmbeddings(model_name=EMBEDDING_MODEL)
# initialize the LLM
llm = QwenLLM(model_name=LLM_MODEL)
# initialize the Chunker
chunker = Chunker(embedding_model=EMBEDDING_MODEL)
# initialize the Vector DB
lancedb = LanceDB(table_name="rag_table")
# Run document Indexing
print("Start Chunking ....")
documents = chunker.chunk(pdf_data)
print("Chunking done....")
df = pd.DataFrame(documents, columns=["content", "page_number", "pdf_name"])
print("Start Embedding ....")
df["embeddings"] = df["content"].apply(embeddings.embed_query)
print("Embedding done....")
print(df)
print("Start saving ....")
lancedb.save(df)
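# Optionally build the vector index once the table is large enough; for a single
# report, LanceDB's brute-force search is already fast. (This step and the row
# threshold are an assumption, not part of the original pipeline.)
if lancedb.get_count() > 256:
    lancedb.create_index()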
# RAG
query = "How is climate change affecting biodiversity?"
vector_query = embeddings.embed_query(query)
result_df = lancedb.semantic_search(vector_query=vector_query, n=2)
context = "\n\n".join(result_df["content"].tolist())
formatted_prompt = PromptTemplate.build(context=context, question=query)
print("\nFormatted Prompt:" + "\n" + formatted_prompt)
final_response = llm.invoke(formatted_prompt, enable_thinking=True, return_thinking=True)
print("\nFinal RAG Response:")
print(final_response["response"])
Final Output Example
**Question:**
How is climate change affecting biodiversity?
**Your Response (max 512 tokens):**
Final RAG Response:
Climate change is profoundly impacting biodiversity through habitat loss, shifting species ranges, and ecosystem disruptions. For example, over 50% of coastal wetlands have been lost globally due to sea level rise, warming, and extreme events, threatening species like mangroves and sea turtles. Species are shifting poleward or uphill (very high confidence), but many cannot adapt fast enough to rising temperatures or extreme weather, leading to local extinctions (very high confidence). Heat extremes and mass mortality events (e.g., coral bleaching) have caused hundreds of species losses. Irreversible changes, such as glacier retreat altering freshwater systems, are accelerating. Ocean acidification and sea level rise also disrupt marine ecosystems. While some shifts occur, many ecosystems are approaching irreversible damage, underscoring the urgency of conservation and adaptive strategies to mitigate these impacts.
Why This Matters
Trustworthy: Cites real IPCC data, not LLM guesswork
Serving the Models: vLLM and LiteLLM
Managing multiple self-hosted LLMs can be challenging for developers. LiteLLM simplifies this by providing a single unified API endpoint for all your models, whether self-hosted or from providers like OpenAI, Gemini, and Anthropic.
We serve open-source LLMs such as Llama 3.1 8B and Mistral 7B, along with BAAI/bge-m3 for embeddings, primarily on vLLM for efficient inference.
Let's start with vLLM. vLLM is a high-throughput, memory-efficient inference and serving engine for LLMs, featuring PagedAttention for optimized management of attention key-value memory.
Our LLMs run on GPU nodes for accelerated inference, while the embedding model is deployed on CPU nodes.
vLLM does not publish a CPU-based Docker image, so I built one from source:
docker run -v ~/.cache/huggingface:/root/.cache/huggingface -p 8000:8000 satendra/vllm-cpu:v0.8.3 --model BAAI/bge-m3
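For the GPU-hosted chat models, the official vllm/vllm-openai image can be used the same way. The exact command isn't shown in this post; a sketch (the model name is an assumption) might be:
docker run --gpus all -v ~/.cache/huggingface:/root/.cache/huggingface -p 8000:8000 \
  vllm/vllm-openai:latest --model mistralai/Mistral-7B-Instruct-v0.3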
LiteLLM is a unified platform simplifying access to over 100 large language models (LLMs), providing an OpenAI-compatible API along with features such as usage tracking, fallback handling, and seamless integration for scalable inference.
To start the LiteLLM server, I use the following Docker Compose file:
version: "3.11"
services:
litellm:
build:
context: .
args:
target: runtime
image: ghcr.io/berriai/litellm:main-stable
#########################################
## Uncomment these lines to start proxy with a config.yaml file ##
volumes:
- ./config.yaml:/app/config.yaml
command:
- "--config=/app/config.yaml"
# - "--detailed_debug"
##############################################
ports:
- "4000:4000" # Map the container port to the host, change the host port if necessary
environment:
DATABASE_URL: "postgresql://llmproxy:dbpassword9090@db:5432/litellm"
STORE_MODEL_IN_DB: "True" # allows adding models to proxy via UI
env_file:
- .env # Load local .env file
depends_on:
- db # Indicates that this service depends on the 'db' service, ensuring 'db' starts first
healthcheck: # Defines the health check configuration for the container
test: [ "CMD", "curl", "-f", "http://localhost:4000/health/liveliness || exit 1" ] # Command to execute for health check
interval: 30s # Perform health check every 30 seconds
timeout: 10s # Health check command times out after 10 seconds
retries: 3 # Retry up to 3 times if health check fails
start_period: 40s # Wait 40 seconds after container start before beginning health checks
db:
image: postgres:16
restart: always
environment:
POSTGRES_DB: litellm
POSTGRES_USER: llmproxy
POSTGRES_PASSWORD: dbpassword9090
ports:
- "5432:5432"
volumes:
- postgres_data:/var/lib/postgresql/data # Persists Postgres data across container restarts
healthcheck:
test: ["CMD-SHELL", "pg_isready -d litellm -U llmproxy"]
interval: 1s
timeout: 5s
retries: 10
prometheus:
image: prom/prometheus
volumes:
- prometheus_data:/prometheus
- ./prometheus.yml:/etc/prometheus/prometheus.yml
ports:
- "9090:9090"
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=15d'
restart: always
volumes:
prometheus_data:
driver: local
postgres_data:
name: litellm_postgres_data # Named volume for Postgres data persistence
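The compose file mounts a config.yaml that tells LiteLLM where each vLLM-served model lives. The file itself isn't reproduced in this post; a minimal sketch, assuming the two vLLM endpoints above (the hostnames are placeholders), might look like this:
model_list:
  - model_name: Mistral-7B                      # the name clients pass as "model"
    litellm_params:
      model: hosted_vllm/mistralai/Mistral-7B-Instruct-v0.3
      api_base: http://vllm-gpu-host:8000/v1    # GPU node serving the chat model
  - model_name: BAAI/bge-m3                     # embedding model on the CPU node
    litellm_params:
      model: hosted_vllm/BAAI/bge-m3
      api_base: http://vllm-cpu-host:8000/v1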
Test the Mistral model with the OpenAI-compatible SDK:
from openai import OpenAI

client = OpenAI(
    base_url="http://xx.0.139.5:4000",
    api_key="fkFihkMb8CDA02vkt7Yr",
)

response = client.chat.completions.create(
    model="Mistral-7B",
    messages=[
        {
            "role": "user",
            "content": "What are the best books for deep learning?"
        }
    ],
    stream=False,
)
print(f"LiteLLM: response {response}")
Response from LLM:
ChatCompletion(id='chatcmpl-e708556a24e84af4af6276439209ae53', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=" There are numerous valuable resources for deep learning, catering to different levels of expertise. Here's a brief list of some highly recommended books on the subject:\n\n1. **Deep Learning** by Yoshua Bengio, Ian Goodfellow, and Aaron Courville: This book is a comprehensive resource for deep learning and covers both theory and practice. It's widely considered one of the best books for beginners looking to learn deep learning concepts.\n\n2. **Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow** by Aurelien Geron: This practical guide uses Python to delve into various machine learning techniques, with a significant focus on deep learning with Keras and TensorFlow.\n\n3. **Neural Networks and Deep Learning** by Michael Nielsen: This free online textbook offers an excellent introduction to the principles and techniques behind neural networks and deep learning. It is often recommended for self-study.\n\n4. **Deep Learning: A Practical Introduction** by Stephen Merity: Built around exercises, this book offers a broad and practical understanding of deep learning and associated techniques. It's a good choice for learners who prefer a hands-on approach.\n\n5. **Probabilistic Reasoning in Intelligent Systems: Networks of Plausible Inference** by Judea Pearl: Although not specifically focused on deep learning, this book provides foundational knowledge about probability theory, a crucial aspect of deep learning, and also learning doing-style content through its numerous exercises.\n\n6. **Reinforcement Learning: An Introduction** by Richard S. Sutton and Andrew G. Barto: This book provides an in-depth exploration of reinforcement learning algorithms, including those used extensively in deep learning for sequential decision making tasks.", refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=None))], created=1743945161, model='hosted_vllm/mistralai/Mistral-7B-Instruct-v0.3', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=387, prompt_tokens=11, total_tokens=398, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None)
Let’s try using a cURL request:
curl --location 'http://xx.0.139.5:4000/chat/completions' \
  --header 'Ocp-Apim-Subscription-Key: fkFihkMb8CDA02vkt7Yr' \
  --header 'Content-Type: application/json' \
  --data '{
    "model": "Mistral-7B",
    "messages": [
      {
        "role": "user",
        "content": "What are the best books for deep learning?"
      }
    ]
  }'
Response:
{"id":"chatcmpl-f6cc7c37dfa947e5ac2d111bb1c94783","created":1743945232,"model":"hosted_vllm/mistralai/Mistral-7B-Instruct-v0.3","object":"chat.completion","system_fingerprint":null,"choices":[{"finish_reason":"stop","index":0,"message":{"content":" Deep learning is a rapidly evolving field, and there are several excellent books that provide a solid foundation for both beginners and experts. Here are some recommendations, categorized by difficulty level:\n\n**Beginners:**\n1. \"Deep Learning with Python\" by François Chollet: This book is a comprehensive guide to deep learning with the Keras library, valuable for programmers who want to build and deploy their own models.\n2. \"Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow\" by Aurélio Oleacretta, Georgios Yannakoudakis, and V. M. Gerosolomou: In this book, you will learn machine learning concepts along with how to implement them using popular libraries like Scikit-Learn and TensorFlow.\n3. \"Deep Learning for Beginners: A Plain English Introduction\" by M. Isaac: This book provides an accessible deep learning intro, focusing on the fundamental principles without getting lost in mathematical details.\n\n**Intermediate:**\n1. \"Neural Networks and Deep Learning\" by Michael Nielsen: This online textbook provides an in-depth exploration of neural networks and deep learning, offering clear explanations and working code examples in Python.\n2. \"Reinforcement Learning: An Introduction\" by Richard S. Sutton and Andrew G. Barto: Although not focused exclusively on deep learning, this book offers essential reinforcement learning concepts, vital for understanding many deep learning techniques.\n\n**Advanced:**\n1. \"Deep Learning\" by Yoshua Bengio, Ian Goodfellow, and Aaron Courville: This landmark book provides a detailed treatment of deep learning theory, algorithms, and applications.\n2. \"Building Machine Learning Systems\" by Carlos Guestrin: In this book, you'll learn about the key mechanisms involved in building large-scale machine learning systems, paying particular attention to challenges in handling big data.\n3. \"Adversarial Robustness: Deep Learning Under Attack\" by Marco Rocha, Raymond T.L. Chan, Cesar A. de la Salud, and Rafael H. Perera: This book focuses on adversarial attacks and defenses in deep learning, which are critical aspects of ensuring the security and robustness of deep learning systems.\n4. \"The Deep Learning Landscape\" by Iasef Badr, Tapas Kanjilal, and Enda Admiraal: This thorough overview of deep learning offers insight into various modules, techniques, and applications in the field.","role":"assistant","tool_calls":null,"function_call":null}}],"usage":{"completion_tokens":549,"prompt_tokens":11,"total_tokens":560,"completion_tokens_details":null,"prompt_tokens_details":null},"service_tier":null,"prompt_logprobs":null}
Access embeddings through the OpenAI-compatible API:
import openai

client = openai.OpenAI(api_key="fkFihkMb8CDA02vkt7Yr", base_url="http://xx.0.139.5:4000")

text = "This is an example text for embedding."

# Call the OpenAI-compatible API for embeddings
response = client.embeddings.create(
    model="BAAI/bge-m3",  # Specify the embedding model
    input=text
)

embedding = response.data[0].embedding
print(embedding)
curl -X 'POST' -k 'http://xx.0.139.5:4000/embeddings' \
  -H 'Ocp-Apim-Subscription-Key: fkFihkMb8CDA02vkt7Yr' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{ "model": "BAAI/bge-m3", "input": "This is an example text for embedding." }'
Conclusion: As the landscape of LLMs continues to grow, tools like LiteLLM are essential for streamlining inference and orchestration. Its OpenAI-compatible API, built-in fallback support, and vendor flexibility position it as a robust proxy layer for modern AI applications.
Deploying Hugging Face Models on Triton Inference Server
Triton Server supports TensorRT, ONNX, TorchScript, TensorFlow, OpenVINO, Python, and DALI models. I will use a Python model for the Hugging Face deployment. Triton expects a standard directory layout for each model type; here is the Python model directory structure:
$ tree model_repository/ -I '__pycache__'
model_repository/               # ROOT FOLDER (may contain many models)
└── sentiment                   # MODEL FOLDER (same as the model name)
    ├── 1                       # MODEL VERSION
    │   └── model.py            # MODEL PYTHON SCRIPT
    ├── config.pbtxt            # CONFIG FILE FOR THE MODEL
    └── hf-sentiment.tar.gz     # CONDA ENV (all dependencies required for Hugging Face)

2 directories, 3 files
Create the empty directory structure described above, and let's go through each file one by one.
config.pbtxt is the model configuration file: it describes the model name, backend, input/output fields and types, and execution details such as device kind (GPU or CPU), maximum batch size, and more. I will use a minimal configuration, sketched below.
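The files themselves aren't reproduced in this post, so here is a minimal config.pbtxt sketch for the Python backend. The tensor names (INPUT_TEXT, SENTIMENT) and the CPU instance group are illustrative assumptions:
name: "sentiment"
backend: "python"
max_batch_size: 0

input [
  {
    name: "INPUT_TEXT"
    data_type: TYPE_STRING
    dims: [ 1 ]
  }
]
output [
  {
    name: "SENTIMENT"
    data_type: TYPE_STRING
    dims: [ 1 ]
  }
]

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]

# Point the Python backend at the packaged conda environment in the model folder.
parameters: {
  key: "EXECUTION_ENV_PATH"
  value: { string_value: "$$TRITON_MODEL_DIRECTORY/hf-sentiment.tar.gz" }
}
A matching model.py sketch using Triton's Python backend API is shown next (again illustrative; the choice of a Hugging Face sentiment pipeline is an assumption):
# model.py (illustrative sketch matching the config above)
import numpy as np
import triton_python_backend_utils as pb_utils
from transformers import pipeline


class TritonPythonModel:
    def initialize(self, args):
        # Load the Hugging Face sentiment pipeline once, when Triton loads the model.
        self.pipe = pipeline("sentiment-analysis")

    def execute(self, requests):
        responses = []
        for request in requests:
            # Decode the incoming TYPE_STRING tensor.
            text = pb_utils.get_input_tensor_by_name(request, "INPUT_TEXT").as_numpy()[0].decode("utf-8")
            label = self.pipe(text)[0]["label"]
            out = pb_utils.Tensor("SENTIMENT", np.array([label], dtype=object))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out]))
        return responses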
$ docker ps -a
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
372793f53dda nvcr.io/nvidia/tritonserver:23.06-py3 "/opt/nvidia/nvidia_…" 18 hours ago Up 15 hours 0.0.0.0:8000-8002->8000-8002/tcp, :::8000-8002->8000-8002/tcp nervous_shannon