Create Datasets for RAG & LLM Fine-Tuning
Prepare datasets from local folders, Google Cloud Storage, and Amazon S3 for retrieval-augmented generation (RAG) and LLM fine-tuning.
Available Sources
- Local folder
- Google Cloud Storage (GCS) bucket
- Amazon S3 bucket
Code Examples
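The sketch below lists files from each source and leaves the dataset-preparation step itself as a placeholder. It assumes the `boto3` and `google-cloud-storage` packages are installed and that credentials are already configured in the environment (for example, via `aws configure` for S3 and Application Default Credentials for Google Cloud).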
import boto3
from google.cloud import storage
import os
# Function to process files from a local directory
def process_local_files(folder_path, output_path):
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            # Process file and save to output_path
            print(f"Processing file: {file_path}")
            # Custom logic for preparing dataset for RAG or fine-tuning
# Function to process files from a Google Cloud Storage bucket
def process_gcs(bucket_name, output_path):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blobs = bucket.list_blobs()
    for blob in blobs:
        print(f"Processing file: {blob.name}")
        # Custom logic for preparing dataset for RAG or fine-tuning
# Function to process files from an Amazon S3 bucket
def process_s3(bucket_name, output_path):
    s3 = boto3.client('s3')
    # Paginate so buckets with more than 1,000 objects are fully listed
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            print(f"Processing file: {obj['Key']}")
            # Custom logic for preparing dataset for RAG or fine-tuning
# Example usage
if __name__ == "__main__":
    LOCAL_FOLDER = "path/to/local/folder"
    OUTPUT_PATH = "path/to/output/folder"
    GCS_BUCKET = "your-gcs-bucket-name"
    S3_BUCKET = "your-s3-bucket-name"

    process_local_files(LOCAL_FOLDER, OUTPUT_PATH)
    process_gcs(GCS_BUCKET, OUTPUT_PATH)
    process_s3(S3_BUCKET, OUTPUT_PATH)
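As one illustration of the placeholder logic above, the following minimal sketch splits a plain-text file into overlapping chunks and appends them as JSONL records, a common input format for both RAG indexing and fine-tuning. The `chunk_text` and `file_to_jsonl` helpers, the chunk size, and the record schema are hypothetical choices for this example, not part of any library used above.

import json
import os

# Hypothetical helper: split text into overlapping character windows so that
# context spanning a chunk boundary is not lost entirely
def chunk_text(text, chunk_size=1000, overlap=100):
    chunks = []
    start = 0
    while start < len(text):
        chunks.append(text[start:start + chunk_size])
        start += chunk_size - overlap
    return chunks

# Hypothetical helper: append one JSONL record per chunk of the input file;
# the {"id", "source", "text"} schema is illustrative
def file_to_jsonl(file_path, output_path):
    with open(file_path, encoding="utf-8") as f:
        text = f.read()
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "dataset.jsonl"), "a", encoding="utf-8") as out:
        for i, chunk in enumerate(chunk_text(text)):
            record = {
                "id": f"{os.path.basename(file_path)}-{i}",
                "source": file_path,
                "text": chunk,
            }
            out.write(json.dumps(record, ensure_ascii=False) + "\n")

Calling `file_to_jsonl(file_path, OUTPUT_PATH)` where each function above prints its progress line (after first downloading cloud objects to local files) would turn these listings into an end-to-end dataset-preparation pipeline.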