Quick Start: Knowledge Table File Upload
Prepare your file for RAG
1. Introduction
This guide demonstrates how to use the JamAI Base SDK to upload and embed files into Knowledge Tables for AI-powered document processing and retrieval.
What are Knowledge Tables?
Knowledge Tables are specialized tables in JamAI Base that provide hybrid search by combining full-text search (FTS) with vector embeddings:
Search Capabilities:
Full-Text Search (FTS): Traditional keyword-based search for exact and partial matches
Semantic Search: Vector embedding-based search for meaning and context
Document Processing:
Automatically chunks documents into manageable, overlapping segments (see the sketch after this list)
Generates vector embeddings for semantic understanding
Indexes content for full-text search
Preserves document structure (tables, layouts, etc.) and metadata
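Chunking with overlap means each segment repeats the tail of the previous one, so a sentence that straddles a chunk boundary is still retrievable as a whole. The snippet below is a purely illustrative, character-based sketch of the idea; it is not JamAI Base's internal chunker, which runs server-side when you call embed_file.
# Illustrative only: shows what "chunk size" and "chunk overlap" mean.
# This is not the server-side chunker JamAI Base applies during embed_file.
def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> list[str]:
    step = chunk_size - chunk_overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

print(len(chunk_text("A" * 2500)))  # 4 overlapping chunks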
Use Cases:
Document retrieval using both keywords and semantic meaning
Question-answering agent
Content recommendation
Knowledge base search and discovery
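Once a document has been embedded, retrieval combines both search modes over the same table. The sketch below is an assumption: the hybrid_search method and the SearchRequest fields shown here are based on the SDK's protocol module, so verify the exact names and signature against the current jamaibase reference before relying on them.
# Hedged sketch: hybrid_search and the SearchRequest fields are assumptions,
# not confirmed against the current jamaibase API reference.
from jamaibase import JamAI, protocol as p

client = JamAI(project_id="your_project_id", token="your_pat")
chunks = client.table.hybrid_search(
    table_type=p.TableType.knowledge,
    request=p.SearchRequest(
        table_id="your_table_id",
        query="What does the handbook say about leave policy?",
        limit=5,
    ),
)
for chunk in chunks:
    print(chunk)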
Supported File Types
The following file formats are supported:
Text files: .txt, .md, .csv, .tsv
Documents: .doc, .docx, .pdf
Presentations: .ppt, .pptx
Spreadsheets: .xls, .xlsx
Markup/Data: .xml, .html, .json, .jsonl
Prerequisites
Before starting, you'll need:
Python 3.10 or higher
Project ID and Personal Access Token (PAT)
Documents to process
2. Installation and Setup
Installing Required Packages
pip install jamaibase python-dotenv
Basic Configuration
from jamaibase import JamAI, protocol as p
from dotenv import load_dotenv
import os
# Load environment variables
load_dotenv()
PROJECT_ID = "your_project_id"
PAT = os.getenv("PAT")
client = JamAI(
project_id=PROJECT_ID,
token=PAT
)
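The configuration above reads the PAT from the environment, so place a .env file next to your script and keep it out of version control. A minimal example (the variable name matches what the code above expects):
# .env
PAT=your_personal_access_token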
3. Creating Your Knowledge Table
Navigate to your JamAI Base knowledge tables tab
Create a new knowledge table
Note down the table ID for later use
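If you prefer to create the table from the SDK instead of the UI, a sketch along the following lines should work; the create_knowledge_table method and KnowledgeTableSchemaCreate schema come from the protocol module, while the embedding model ID shown here is a placeholder, so pick one that is available in your project.
# Hedged sketch: the embedding model ID is a placeholder; check your project's
# available models and the jamaibase SDK reference for the exact schema fields.
table = client.table.create_knowledge_table(
    p.KnowledgeTableSchemaCreate(
        id="my-knowledge-table",  # this value is the table ID you note down
        cols=[],                  # no extra columns beyond the built-in ones
        embedding_model="ellm/BAAI/bge-m3",
    )
)
print(table.id)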
4. Implementation
4.1 Complete Document Uploader Class
from typing import Optional, Dict, List
import os
class DocumentUploader:
def __init__(self, project_id: str, pat: str):
"""Initialize the document uploader"""
self.client = JamAI(
project_id=project_id,
token=pat
)
def validate_file(self, file_path: str) -> bool:
"""Validate if file exists and has supported format"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
supported_types = [
'.csv', '.tsv', '.txt', '.md',
'.doc', '.docx', '.pdf',
'.ppt', '.pptx',
'.xls', '.xlsx',
'.xml', '.html',
'.json', '.jsonl'
]
file_ext = os.path.splitext(file_path)[1].lower()
if file_ext not in supported_types:
raise ValueError(
f"Unsupported file format. Supported formats: {', '.join(supported_types)}"
)
return True
def get_mime_type(self, file_path: str) -> str:
"""Get MIME type of the file"""
mime_types = {
'.csv': 'text/csv',
'.tsv': 'text/tab-separated-values',
'.txt': 'text/plain',
'.md': 'text/markdown',
'.doc': 'application/msword',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.pdf': 'application/pdf',
'.ppt': 'application/vnd.ms-powerpoint',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.xls': 'application/vnd.ms-excel',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.xml': 'application/xml',
'.html': 'text/html',
'.json': 'application/json',
'.jsonl': 'application/jsonl'
}
file_ext = os.path.splitext(file_path)[1].lower()
return mime_types.get(file_ext, 'application/octet-stream')
def get_optimal_chunk_settings(self, file_path: str) -> Dict[str, int]:
"""Determine optimal chunk settings based on file type"""
file_ext = os.path.splitext(file_path)[1].lower()
settings = {
# Text-based documents
'.txt': {'size': 1000, 'overlap': 200},
'.md': {'size': 1000, 'overlap': 200},
'.csv': {'size': 800, 'overlap': 150},
'.tsv': {'size': 800, 'overlap': 150},
# Rich text documents
'.doc': {'size': 1200, 'overlap': 250},
'.docx': {'size': 1200, 'overlap': 250},
'.pdf': {'size': 1500, 'overlap': 300},
# Presentations
'.ppt': {'size': 1000, 'overlap': 200},
'.pptx': {'size': 1000, 'overlap': 200},
# Spreadsheets
'.xls': {'size': 800, 'overlap': 150},
'.xlsx': {'size': 800, 'overlap': 150},
# Markup/structured documents
'.xml': {'size': 1000, 'overlap': 200},
'.html': {'size': 1000, 'overlap': 200},
'.json': {'size': 800, 'overlap': 150},
'.jsonl': {'size': 800, 'overlap': 150},
# Default settings
'default': {'size': 1000, 'overlap': 200}
}
return settings.get(file_ext, settings['default'])
def upload_document(self, file_path: str, table_id: str,
custom_chunk_size: Optional[int] = None,
custom_chunk_overlap: Optional[int] = None) -> bool:
"""Upload single document with optimized settings"""
try:
# Validate file
self.validate_file(file_path)
# Get file information
file_name = os.path.basename(file_path)
file_size = os.path.getsize(file_path)
mime_type = self.get_mime_type(file_path)
# Get chunk settings
settings = self.get_optimal_chunk_settings(file_path)
chunk_size = custom_chunk_size or settings['size']
chunk_overlap = custom_chunk_overlap or settings['overlap']
print(f"Uploading: {file_name}")
print(f"File type: {mime_type}")
print(f"File size: {file_size / 1024:.2f} KB")
print(f"Chunk size: {chunk_size}, Overlap: {chunk_overlap}")
# Upload and embed file
response = self.client.table.embed_file(
file_path=file_path,
table_id=table_id,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
print(f"Upload successful: {file_name}")
return True
except Exception as e:
print(f"Error uploading {file_path}: {str(e)}")
return False
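With the class above in scope, a single upload takes just two calls; the project ID, table ID, and file path below are placeholders:
# Example usage of the DocumentUploader defined above.
uploader = DocumentUploader(project_id="your_project_id", pat=os.getenv("PAT"))
uploader.upload_document(
    file_path="path/to/document.pdf",  # placeholder path
    table_id="your_table_id",
)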
5. Complete Standalone Script
Save this as knowledge_uploader.py:
import os
import argparse
from jamaibase import JamAI, protocol as p
from typing import Optional, Dict, List
from dotenv import load_dotenv
class DocumentUploader:
def __init__(self, project_id: str, pat: str):
"""Initialize the document uploader"""
self.client = JamAI(
project_id=project_id,
token=pat
)
def validate_file(self, file_path: str) -> bool:
"""Validate if file exists and has supported format"""
if not os.path.exists(file_path):
raise FileNotFoundError(f"File not found: {file_path}")
supported_types = [
'.csv', '.tsv', '.txt', '.md',
'.doc', '.docx', '.pdf',
'.ppt', '.pptx',
'.xls', '.xlsx',
'.xml', '.html',
'.json', '.jsonl'
]
file_ext = os.path.splitext(file_path)[1].lower()
if file_ext not in supported_types:
raise ValueError(
f"Unsupported file format. Supported formats: {', '.join(supported_types)}"
)
return True
def get_mime_type(self, file_path: str) -> str:
"""Get MIME type of the file"""
mime_types = {
'.csv': 'text/csv',
'.tsv': 'text/tab-separated-values',
'.txt': 'text/plain',
'.md': 'text/markdown',
'.doc': 'application/msword',
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'.pdf': 'application/pdf',
'.ppt': 'application/vnd.ms-powerpoint',
'.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
'.xls': 'application/vnd.ms-excel',
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
'.xml': 'application/xml',
'.html': 'text/html',
'.json': 'application/json',
'.jsonl': 'application/jsonl'
}
file_ext = os.path.splitext(file_path)[1].lower()
return mime_types.get(file_ext, 'application/octet-stream')
def get_optimal_chunk_settings(self, file_path: str) -> Dict[str, int]:
"""Determine optimal chunk settings based on file type"""
file_ext = os.path.splitext(file_path)[1].lower()
settings = {
# Text-based documents
'.txt': {'size': 1000, 'overlap': 200},
'.md': {'size': 1000, 'overlap': 200},
'.csv': {'size': 800, 'overlap': 150},
'.tsv': {'size': 800, 'overlap': 150},
# Rich text documents
'.doc': {'size': 1200, 'overlap': 250},
'.docx': {'size': 1200, 'overlap': 250},
'.pdf': {'size': 1500, 'overlap': 300},
# Presentations
'.ppt': {'size': 1000, 'overlap': 200},
'.pptx': {'size': 1000, 'overlap': 200},
# Spreadsheets
'.xls': {'size': 800, 'overlap': 150},
'.xlsx': {'size': 800, 'overlap': 150},
# Markup/structured documents
'.xml': {'size': 1000, 'overlap': 200},
'.html': {'size': 1000, 'overlap': 200},
'.json': {'size': 800, 'overlap': 150},
'.jsonl': {'size': 800, 'overlap': 150},
# Default settings
'default': {'size': 1000, 'overlap': 200}
}
return settings.get(file_ext, settings['default'])
def upload_document(self, file_path: str, table_id: str,
custom_chunk_size: Optional[int] = None,
custom_chunk_overlap: Optional[int] = None) -> bool:
"""Upload single document with optimized settings"""
try:
# Validate file
self.validate_file(file_path)
# Get file information
file_name = os.path.basename(file_path)
file_size = os.path.getsize(file_path)
mime_type = self.get_mime_type(file_path)
# Get chunk settings
settings = self.get_optimal_chunk_settings(file_path)
chunk_size = custom_chunk_size or settings['size']
chunk_overlap = custom_chunk_overlap or settings['overlap']
print(f"Uploading: {file_name}")
print(f"File type: {mime_type}")
print(f"File size: {file_size / 1024:.2f} KB")
print(f"Chunk size: {chunk_size}, Overlap: {chunk_overlap}")
# Upload and embed file
response = self.client.table.embed_file(
file_path=file_path,
table_id=table_id,
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
)
print(f"Upload successful: {file_name}")
return True
except Exception as e:
print(f"Error uploading {file_path}: {str(e)}")
return False
def process_folder(folder_path: str, uploader: DocumentUploader,
table_id: str, chunk_size: Optional[int] = None,
chunk_overlap: Optional[int] = None) -> tuple[List[str], List[str]]:
"""Process all documents in a folder"""
successful = []
failed = []
for filename in os.listdir(folder_path):
file_path = os.path.join(folder_path, filename)
if os.path.isfile(file_path):
if uploader.upload_document(
file_path,
table_id,
chunk_size,
chunk_overlap
):
successful.append(filename)
else:
failed.append(filename)
return successful, failed
def main():
# Set up argument parser
parser = argparse.ArgumentParser(
description='Upload documents to JamAI Base Knowledge Table'
)
parser.add_argument('--project-id', required=True,
help='Your JamAI Base project ID')
parser.add_argument('--pat', required=True,
help='Your Personal Access Token')
parser.add_argument('--table-id', required=True,
help='Knowledge Table ID')
parser.add_argument('--input', required=True,
help='Path to file or folder')
parser.add_argument('--chunk-size', type=int,
help='Custom chunk size')
parser.add_argument('--chunk-overlap', type=int,
help='Custom chunk overlap')
args = parser.parse_args()
# Initialize uploader
uploader = DocumentUploader(args.project_id, args.pat)
# Process input
if os.path.isfile(args.input):
# Single file processing
success = uploader.upload_document(
args.input,
args.table_id,
args.chunk_size,
args.chunk_overlap
)
print(f"\nFinal Status: Upload {'successful' if success else 'failed'}")
else:
# Folder processing
successful, failed = process_folder(
args.input,
uploader,
args.table_id,
args.chunk_size,
args.chunk_overlap
)
print("\nUpload Summary:")
print(f"Successful: {len(successful)} files")
print(f"Failed: {len(failed)} files")
if successful:
print("\nSuccessfully uploaded files:")
for file in successful:
print(f"- {file}")
if failed:
print("\nFailed uploads:")
for file in failed:
print(f"- {file}")
if __name__ == "__main__":
main()
6. Usage Examples
Single File Upload
python knowledge_uploader.py \
--project-id "your_project_id" \
--pat "your_pat" \
--table-id "your_table_id" \
--input "path/to/document.pdf"
Folder Upload
python knowledge_uploader.py \
--project-id "your_project_id" \
--pat "your_pat" \
--table-id "your_table_id" \
--input "path/to/documents/folder"
Custom Chunk Settings
python knowledge_uploader.py \
--project-id "your_project_id" \
--pat "your_pat" \
--table-id "your_table_id" \
--input "path/to/document.pdf" \
--chunk-size 2000 \
--chunk-overlap 400
7. Example Output
Uploading: document.pdf
File type: application/pdf
File size: 1024.50 KB
Chunk size: 1500, Overlap: 300
Upload successful: document.pdf
Final Status: Upload successful
For folder processing:
Upload Summary:
Successful: 3 files
Failed: 1 files
Successfully uploaded files:
- document1.pdf
- document2.docx
- presentation.pptx
Failed uploads:
- invalid_format.xyz
8. Best Practices
File Handling
Always validate files before upload
Use appropriate chunk sizes for different document types
Handle large files appropriately
Performance
Reuse the client instance
Process files in batches
Consider implementing rate limiting for large batches (see the sketch below)
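A minimal way to throttle a large batch is to pause between uploads. The sketch below reuses the DocumentUploader from section 4; the one-second delay is illustrative, so tune it to your account's limits.
import time

def upload_with_delay(uploader: DocumentUploader, file_paths: list[str],
                      table_id: str, delay_seconds: float = 1.0) -> None:
    """Upload files one by one with a fixed pause between requests."""
    for path in file_paths:
        uploader.upload_document(path, table_id)
        time.sleep(delay_seconds)  # simple throttle between uploads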
Error Handling
Validate input files
Handle network errors gracefully
Provide meaningful error messages
Security
Use environment variables for credentials
Validate file content when necessary
Implement proper access controls
This implementation provides a robust foundation for uploading documents to JamAI Base Knowledge Tables, with support for every officially supported file type and sensible default chunk settings for each format.