Create Datasets for RAG & LLM Fine-Tuning

Prepare datasets from multiple sources for retrieval-augmented generation (RAG) pipelines and LLM fine-tuning.

Available Sources

  • Local Folder
  • Google Cloud Storage Bucket
  • Amazon S3 Bucket

Code Examples

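The script below walks each source and prints every file it finds; the "Custom logic" comments mark where your own dataset-preparation step (cleaning, chunking, formatting) would go.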

import boto3
from google.cloud import storage
import os

# Function to process files from a local directory
def process_local_files(folder_path, output_path):
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        if os.path.isfile(file_path):
            # Process file and save to output_path
            print(f"Processing file: {file_path}")
            # Custom logic for preparing dataset for RAG or fine-tuning

# Function to process files from a Google Cloud Storage bucket
def process_gcs(bucket_name, output_path):
    # storage.Client() authenticates via Application Default Credentials
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    for blob in bucket.list_blobs():
        print(f"Processing file: {blob.name}")
        # Custom logic for preparing dataset for RAG or fine-tuning

# Function to process files from an Amazon S3 bucket
def process_s3(bucket_name, output_path):
    s3 = boto3.client('s3')
    # Paginate so buckets with more than 1,000 objects are fully listed
    paginator = s3.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            print(f"Processing file: {obj['Key']}")
            # Custom logic for preparing dataset for RAG or fine-tuning

# Example usage
if __name__ == "__main__":
    LOCAL_FOLDER = "path/to/local/folder"
    OUTPUT_PATH = "path/to/output/folder"
    GCS_BUCKET = "your-gcs-bucket-name"
    S3_BUCKET = "your-s3-bucket-name"

    process_local_files(LOCAL_FOLDER, OUTPUT_PATH)
    process_gcs(GCS_BUCKET, OUTPUT_PATH)
    process_s3(S3_BUCKET, OUTPUT_PATH)
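
The "Custom logic" comments above are intentionally left as placeholders. As one possible implementation, the sketch below splits a plain-text file into fixed-size chunks and appends each chunk as a JSON Lines record, a common input shape for both RAG indexing and fine-tuning. The function name prepare_text_file, the 1,000-character chunk size, and the record fields are illustrative assumptions, not a prescribed format.


import json
import os

# Hypothetical helper: split one text file into chunks and append them
# to a JSONL file. Chunk size and record layout are assumptions.
def prepare_text_file(file_path, output_path, chunk_size=1000):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read()
    os.makedirs(output_path, exist_ok=True)
    out_file = os.path.join(output_path, "dataset.jsonl")
    # Open in append mode so repeated calls accumulate records
    with open(out_file, "a", encoding="utf-8") as out:
        for start in range(0, len(text), chunk_size):
            record = {"source": file_path, "text": text[start:start + chunk_size]}
            out.write(json.dumps(record) + "\n")


To use it, call prepare_text_file(file_path, output_path) in place of each placeholder comment. For the cloud sources you would first download each object to a local file, e.g. with blob.download_to_filename(...) for Google Cloud Storage or s3.download_file(...) for S3.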