This guide demonstrates how to use the JamAI Base SDK to upload and embed files into Knowledge Tables for AI-powered document processing and retrieval.
Knowledge Tables are specialized tables in JamAI Base that provide hybrid-search capabilities through both full-text search (FTS) and vector embeddings.
pip install jamaibase python-dotenv
Copy from jamaibase import JamAI, protocol as p
from dotenv import load_dotenv
import os
# Load environment variables (python-dotenv)
load_dotenv()

PROJECT_ID = "your_project_id"
# The Personal Access Token is read from the PAT environment variable;
# it is None when that variable is not set.
PAT = os.environ.get("PAT")

client = JamAI(project_id=PROJECT_ID, token=PAT)
Copy from typing import Optional, Dict, List
import os
class DocumentUploader:
    """Upload local documents into a JamAI Base Knowledge Table.

    Adds extension validation, MIME-type lookup and per-format chunking
    defaults on top of the client's ``table.embed_file`` call.
    """

    # Extension -> MIME type for every accepted format. The keys double as
    # the list of supported extensions, so the two can never drift apart.
    _MIME_TYPES = {
        '.csv': 'text/csv',
        '.tsv': 'text/tab-separated-values',
        '.txt': 'text/plain',
        '.md': 'text/markdown',
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.pdf': 'application/pdf',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        '.xls': 'application/vnd.ms-excel',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.xml': 'application/xml',
        '.html': 'text/html',
        '.json': 'application/json',
        '.jsonl': 'application/jsonl',
    }

    # Chunking used for any extension without an entry in _CHUNK_SETTINGS.
    _DEFAULT_CHUNK_SETTINGS = {'size': 1000, 'overlap': 200}

    # Extension -> chunk size / overlap tuned per document family.
    _CHUNK_SETTINGS = {
        # Text-based documents
        '.txt': {'size': 1000, 'overlap': 200},
        '.md': {'size': 1000, 'overlap': 200},
        '.csv': {'size': 800, 'overlap': 150},
        '.tsv': {'size': 800, 'overlap': 150},
        # Rich text documents
        '.doc': {'size': 1200, 'overlap': 250},
        '.docx': {'size': 1200, 'overlap': 250},
        '.pdf': {'size': 1500, 'overlap': 300},
        # Presentations
        '.ppt': {'size': 1000, 'overlap': 200},
        '.pptx': {'size': 1000, 'overlap': 200},
        # Spreadsheets
        '.xls': {'size': 800, 'overlap': 150},
        '.xlsx': {'size': 800, 'overlap': 150},
        # Markup/structured documents
        '.xml': {'size': 1000, 'overlap': 200},
        '.html': {'size': 1000, 'overlap': 200},
        '.json': {'size': 800, 'overlap': 150},
        '.jsonl': {'size': 800, 'overlap': 150},
    }

    def __init__(self, project_id: str, pat: str):
        """Create the JamAI client used for all uploads.

        Args:
            project_id: JamAI Base project ID.
            pat: Personal Access Token, passed to the client as ``token``.
        """
        self.client = JamAI(
            project_id=project_id,
            token=pat
        )

    def validate_file(self, file_path: str) -> bool:
        """Check that ``file_path`` is an existing file of a supported format.

        Returns:
            True when the file passes both checks.

        Raises:
            FileNotFoundError: The path does not exist or is not a regular
                file (for example, a directory).
            ValueError: The extension is not one of the supported formats.
        """
        # isfile (not exists) so that a directory named "x.pdf" is rejected
        # here instead of failing later inside the upload call.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self._MIME_TYPES:
            raise ValueError(
                f"Unsupported file format. Supported formats: {', '.join(self._MIME_TYPES)}"
            )
        return True

    def get_mime_type(self, file_path: str) -> str:
        """Return the MIME type for the file's extension (case-insensitive).

        Unknown extensions map to ``application/octet-stream``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        return self._MIME_TYPES.get(file_ext, 'application/octet-stream')

    def get_optimal_chunk_settings(self, file_path: str) -> Dict[str, int]:
        """Return the ``{'size', 'overlap'}`` chunk settings for the file type.

        Unknown extensions get the default of size 1000 / overlap 200.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        # Hand back a copy so callers cannot mutate the shared class tables.
        return dict(self._CHUNK_SETTINGS.get(file_ext, self._DEFAULT_CHUNK_SETTINGS))

    def upload_document(self, file_path: str, table_id: str,
                        custom_chunk_size: Optional[int] = None,
                        custom_chunk_overlap: Optional[int] = None) -> bool:
        """Validate, then upload and embed a single document.

        Args:
            file_path: Path of the document to upload.
            table_id: ID of the target Knowledge Table.
            custom_chunk_size: Overrides the per-format chunk size when not None.
            custom_chunk_overlap: Overrides the per-format overlap when not
                None; an explicit 0 is honoured.

        Returns:
            True on success. Any failure (validation or upload) is printed
            and reported as False rather than raised, so batch callers can
            carry on with the next file.
        """
        try:
            # Validate file
            self.validate_file(file_path)

            # Get file information
            file_name = os.path.basename(file_path)
            file_size = os.path.getsize(file_path)
            mime_type = self.get_mime_type(file_path)

            # Get chunk settings. Compare against None rather than relying
            # on truthiness: `value or default` would discard an explicit 0.
            settings = self.get_optimal_chunk_settings(file_path)
            chunk_size = settings['size'] if custom_chunk_size is None else custom_chunk_size
            chunk_overlap = (
                settings['overlap'] if custom_chunk_overlap is None else custom_chunk_overlap
            )

            print(f"Uploading: {file_name}")
            print(f"File type: {mime_type}")
            print(f"File size: {file_size / 1024:.2f} KB")
            print(f"Chunk size: {chunk_size}, Overlap: {chunk_overlap}")

            # Upload and embed file
            self.client.table.embed_file(
                file_path=file_path,
                table_id=table_id,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
            print(f"Upload successful: {file_name}")
            return True
        except Exception as e:
            print(f"Error uploading {file_path}: {str(e)}")
            return False
Copy import os
import argparse
from jamaibase import JamAI, protocol as p
from typing import Optional, Dict, List
from dotenv import load_dotenv
class DocumentUploader:
    """Upload local documents into a JamAI Base Knowledge Table.

    Adds extension validation, MIME-type lookup and per-format chunking
    defaults on top of the client's ``table.embed_file`` call.
    """

    # Extension -> MIME type for every accepted format. The keys double as
    # the list of supported extensions, so the two can never drift apart.
    _MIME_TYPES = {
        '.csv': 'text/csv',
        '.tsv': 'text/tab-separated-values',
        '.txt': 'text/plain',
        '.md': 'text/markdown',
        '.doc': 'application/msword',
        '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        '.pdf': 'application/pdf',
        '.ppt': 'application/vnd.ms-powerpoint',
        '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        '.xls': 'application/vnd.ms-excel',
        '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        '.xml': 'application/xml',
        '.html': 'text/html',
        '.json': 'application/json',
        '.jsonl': 'application/jsonl',
    }

    # Chunking used for any extension without an entry in _CHUNK_SETTINGS.
    _DEFAULT_CHUNK_SETTINGS = {'size': 1000, 'overlap': 200}

    # Extension -> chunk size / overlap tuned per document family.
    _CHUNK_SETTINGS = {
        # Text-based documents
        '.txt': {'size': 1000, 'overlap': 200},
        '.md': {'size': 1000, 'overlap': 200},
        '.csv': {'size': 800, 'overlap': 150},
        '.tsv': {'size': 800, 'overlap': 150},
        # Rich text documents
        '.doc': {'size': 1200, 'overlap': 250},
        '.docx': {'size': 1200, 'overlap': 250},
        '.pdf': {'size': 1500, 'overlap': 300},
        # Presentations
        '.ppt': {'size': 1000, 'overlap': 200},
        '.pptx': {'size': 1000, 'overlap': 200},
        # Spreadsheets
        '.xls': {'size': 800, 'overlap': 150},
        '.xlsx': {'size': 800, 'overlap': 150},
        # Markup/structured documents
        '.xml': {'size': 1000, 'overlap': 200},
        '.html': {'size': 1000, 'overlap': 200},
        '.json': {'size': 800, 'overlap': 150},
        '.jsonl': {'size': 800, 'overlap': 150},
    }

    def __init__(self, project_id: str, pat: str):
        """Create the JamAI client used for all uploads.

        Args:
            project_id: JamAI Base project ID.
            pat: Personal Access Token, passed to the client as ``token``.
        """
        self.client = JamAI(
            project_id=project_id,
            token=pat
        )

    def validate_file(self, file_path: str) -> bool:
        """Check that ``file_path`` is an existing file of a supported format.

        Returns:
            True when the file passes both checks.

        Raises:
            FileNotFoundError: The path does not exist or is not a regular
                file (for example, a directory).
            ValueError: The extension is not one of the supported formats.
        """
        # isfile (not exists) so that a directory named "x.pdf" is rejected
        # here instead of failing later inside the upload call.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = os.path.splitext(file_path)[1].lower()
        if file_ext not in self._MIME_TYPES:
            raise ValueError(
                f"Unsupported file format. Supported formats: {', '.join(self._MIME_TYPES)}"
            )
        return True

    def get_mime_type(self, file_path: str) -> str:
        """Return the MIME type for the file's extension (case-insensitive).

        Unknown extensions map to ``application/octet-stream``.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        return self._MIME_TYPES.get(file_ext, 'application/octet-stream')

    def get_optimal_chunk_settings(self, file_path: str) -> Dict[str, int]:
        """Return the ``{'size', 'overlap'}`` chunk settings for the file type.

        Unknown extensions get the default of size 1000 / overlap 200.
        """
        file_ext = os.path.splitext(file_path)[1].lower()
        # Hand back a copy so callers cannot mutate the shared class tables.
        return dict(self._CHUNK_SETTINGS.get(file_ext, self._DEFAULT_CHUNK_SETTINGS))

    def upload_document(self, file_path: str, table_id: str,
                        custom_chunk_size: Optional[int] = None,
                        custom_chunk_overlap: Optional[int] = None) -> bool:
        """Validate, then upload and embed a single document.

        Args:
            file_path: Path of the document to upload.
            table_id: ID of the target Knowledge Table.
            custom_chunk_size: Overrides the per-format chunk size when not None.
            custom_chunk_overlap: Overrides the per-format overlap when not
                None; an explicit 0 is honoured.

        Returns:
            True on success. Any failure (validation or upload) is printed
            and reported as False rather than raised, so batch callers can
            carry on with the next file.
        """
        try:
            # Validate file
            self.validate_file(file_path)

            # Get file information
            file_name = os.path.basename(file_path)
            file_size = os.path.getsize(file_path)
            mime_type = self.get_mime_type(file_path)

            # Get chunk settings. Compare against None rather than relying
            # on truthiness: `value or default` would discard an explicit 0.
            settings = self.get_optimal_chunk_settings(file_path)
            chunk_size = settings['size'] if custom_chunk_size is None else custom_chunk_size
            chunk_overlap = (
                settings['overlap'] if custom_chunk_overlap is None else custom_chunk_overlap
            )

            print(f"Uploading: {file_name}")
            print(f"File type: {mime_type}")
            print(f"File size: {file_size / 1024:.2f} KB")
            print(f"Chunk size: {chunk_size}, Overlap: {chunk_overlap}")

            # Upload and embed file
            self.client.table.embed_file(
                file_path=file_path,
                table_id=table_id,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
            print(f"Upload successful: {file_name}")
            return True
        except Exception as e:
            print(f"Error uploading {file_path}: {str(e)}")
            return False
def process_folder(folder_path: str, uploader: "DocumentUploader",
                   table_id: str, chunk_size: Optional[int] = None,
                   chunk_overlap: Optional[int] = None) -> tuple[List[str], List[str]]:
    """Upload every regular file directly inside ``folder_path``.

    Sub-folders are skipped (no recursion). Files are processed in sorted
    name order so the upload sequence and the returned lists are
    reproducible from run to run.

    Args:
        folder_path: Directory whose files should be uploaded.
        uploader: Object whose ``upload_document(path, table_id, chunk_size,
            chunk_overlap)`` returns a truthy value on success.
        table_id: ID of the target Knowledge Table.
        chunk_size: Optional chunk-size override forwarded to the uploader.
        chunk_overlap: Optional overlap override forwarded to the uploader.

    Returns:
        ``(successful, failed)`` lists of file names (not full paths).
    """
    successful: List[str] = []
    failed: List[str] = []

    # os.listdir yields entries in arbitrary order; sort for determinism.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue

        uploaded = uploader.upload_document(
            file_path,
            table_id,
            chunk_size,
            chunk_overlap
        )
        (successful if uploaded else failed).append(filename)

    return successful, failed
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point: upload one file, or every file in a folder.

    Args:
        argv: Argument list to parse. ``None`` (the default) lets argparse
            read the process command line.

    Returns:
        0 when every upload succeeded, 1 when at least one upload failed.
        An invalid command line, including an ``--input`` path that is
        neither a file nor a folder, exits through ``parser.error``.
    """
    # Set up argument parser
    parser = argparse.ArgumentParser(
        description='Upload documents to JamAI Base Knowledge Table'
    )
    parser.add_argument('--project-id', required=True,
                        help='Your JamAI Base project ID')
    parser.add_argument('--pat', required=True,
                        help='Your Personal Access Token')
    parser.add_argument('--table-id', required=True,
                        help='Knowledge Table ID')
    parser.add_argument('--input', required=True,
                        help='Path to file or folder')
    parser.add_argument('--chunk-size', type=int,
                        help='Custom chunk size')
    parser.add_argument('--chunk-overlap', type=int,
                        help='Custom chunk overlap')
    args = parser.parse_args(argv)

    # Reject a bad path up front: otherwise a typo would fall through to the
    # folder branch and surface as a traceback from os.listdir.
    input_is_file = os.path.isfile(args.input)
    if not input_is_file and not os.path.isdir(args.input):
        parser.error(f"Input path is not an existing file or folder: {args.input}")

    # Initialize uploader
    uploader = DocumentUploader(args.project_id, args.pat)

    # Process input
    if input_is_file:
        # Single file processing
        success = uploader.upload_document(
            args.input,
            args.table_id,
            args.chunk_size,
            args.chunk_overlap
        )
        print(f"\nFinal Status: Upload {'successful' if success else 'failed'}")
        return 0 if success else 1

    # Folder processing
    successful, failed = process_folder(
        args.input,
        uploader,
        args.table_id,
        args.chunk_size,
        args.chunk_overlap
    )
    print("\nUpload Summary:")
    print(f"Successful: {len(successful)} files")
    print(f"Failed: {len(failed)} files")
    if successful:
        print("\nSuccessfully uploaded files:")
        for name in successful:
            print(f"- {name}")
    if failed:
        print("\nFailed uploads:")
        for name in failed:
            print(f"- {name}")
    return 1 if failed else 0


if __name__ == "__main__":
    # Propagate the result as the process exit status so shells and CI can
    # detect failed uploads.
    raise SystemExit(main())
python knowledge_uploader.py \
--project-id "your_project_id" \
--pat "your_pat" \
--table-id "your_table_id" \
--input "path/to/document.pdf"
python knowledge_uploader.py \
--project-id "your_project_id" \
--pat "your_pat" \
--table-id "your_table_id" \
--input "path/to/documents/folder"
python knowledge_uploader.py \
--project-id "your_project_id" \
--pat "your_pat" \
--table-id "your_table_id" \
--input "path/to/document.pdf" \
--chunk-size 2000 \
--chunk-overlap 400
Uploading: document.pdf
File type: application/pdf
File size: 1024.50 KB
Chunk size: 1500, Overlap: 300
Upload successful: document.pdf
Final Status: Upload successful
Upload Summary:
Successful: 3 files
Failed: 1 files
Successfully uploaded files:
- document1.pdf
- document2.docx
- presentation.pptx
Failed uploads:
- invalid_format.xyz
This implementation provides a robust foundation for uploading documents to JamAI Base Knowledge Tables, with support for all official file types and optimal processing settings for each format.