DeepInfra — LlamaIndex integration
LlamaIndex is a popular data framework for LLM applications — RAG pipelines, agents, and more. DeepInfra integrates with LlamaIndex for both LLMs and embeddings.

LLMs

Installation

pip install llama-index-llms-deepinfra

Initialization

from llama_index.llms.deepinfra import DeepInfraLLM
import asyncio
import os

# Read the API key from the environment. Note: a literal "$DEEPINFRA_TOKEN"
# string is NOT expanded by Python — it would be sent to the API verbatim
# and fail authentication.
llm = DeepInfraLLM(
    model="deepseek-ai/DeepSeek-V3",
    api_key=os.getenv("DEEPINFRA_TOKEN"),
    temperature=0.5,
    max_tokens=50,
    # Extra sampling parameters are forwarded to the API unchanged.
    additional_kwargs={"top_p": 0.9},
)

Synchronous

# Complete: single-shot text completion.
print(llm.complete("Hello World!").text)

# Stream complete: deltas arrive incrementally.
for chunk in llm.stream_complete("Once upon a time"):
    print(chunk.delta, end="")

# Chat: message-based interface.
from llama_index.core.base.llms.types import ChatMessage

chat_result = llm.chat([ChatMessage(role="user", content="Tell me a joke.")])
print(chat_result.message.content)

# Stream chat: incremental deltas for a multi-message conversation.
conversation = [
    ChatMessage(role="system", content="You are a helpful assistant."),
    ChatMessage(role="user", content="Tell me a story."),
]
for partial in llm.stream_chat(conversation):
    print(partial.delta, end="")

Asynchronous

# Async complete
async def async_complete():
    """Request a completion asynchronously and print its text."""
    result = await llm.acomplete("Hello Async World!")
    print(result.text)

asyncio.run(async_complete())

# Async stream complete
async def async_stream_complete():
    """Stream a completion asynchronously, printing each delta as it arrives."""
    stream = await llm.astream_complete("Once upon an async time")
    async for chunk in stream:
        print(chunk.delta, end="")

asyncio.run(async_stream_complete())

# Async chat
async def async_chat():
    """Send a single-message chat request asynchronously and print the reply."""
    reply = await llm.achat(
        [ChatMessage(role="user", content="Tell me an async joke.")]
    )
    print(reply.message.content)

asyncio.run(async_chat())

# Async stream chat
async def async_stream_chat():
    """Stream a multi-message chat response asynchronously, printing deltas."""
    history = [
        ChatMessage(role="system", content="You are a helpful assistant."),
        ChatMessage(role="user", content="Tell me an async story."),
    ]
    stream = await llm.astream_chat(history)
    async for part in stream:
        print(part.delta, end="")

asyncio.run(async_stream_chat())

Embeddings

Installation

pip install llama-index llama-index-embeddings-deepinfra

Initialization

from dotenv import load_dotenv, find_dotenv
from llama_index.embeddings.deepinfra import DeepInfraEmbeddingModel
import os

# Load variables (including DEEPINFRA_TOKEN) from a .env file into the
# process environment.
_ = load_dotenv(find_dotenv())

model = DeepInfraEmbeddingModel(
    model_id="Qwen/Qwen3-Embedding-8B",
    # Read the token loaded above. A literal "$DEEPINFRA_TOKEN" string is
    # NOT expanded by Python and would be sent to the API verbatim.
    api_token=os.getenv("DEEPINFRA_TOKEN"),
    normalize=True,
    text_prefix="text: ",
    query_prefix="query: ",
)

Synchronous requests

# Single text embedding.
print(model.get_text_embedding("hello world"))

# Batch of texts in one request.
batch_inputs = ["hello world", "goodbye world"]
print(model.get_text_embedding_batch(batch_inputs))

# Query embedding (uses the configured query_prefix).
print(model.get_query_embedding("hello world"))

Asynchronous requests

import asyncio

async def main():
    """Fetch one text embedding asynchronously and print it."""
    embedding = await model.aget_text_embedding("hello world")
    print(embedding)

asyncio.run(main())