Get started with FilePrepper in 5 minutes - either as a CLI tool or SDK.
# Install globally
dotnet tool install -g fileprepper-cli
# Verify installation
fileprepper --version
# Expected: 0.4.9
fileprepper --help
# Add to your project
dotnet add package FilePrepper
# Or in .csproj
<PackageReference Include="FilePrepper" Version="0.4.9" />
git clone https://github.com/iyulab/FilePrepper.git
cd FilePrepper
dotnet build src/FilePrepper.sln
# Run CLI from source
cd src/FilePrepper.CLI
dotnet run -- --help
Create sample.csv:
Name,Age,Salary,Department
Alice,25,50000,Engineering
Bob,30,75000,Sales
Charlie,35,60000,Engineering
David,28,,Sales
Eve,32,70000,Marketing
# If installed as global tool
fileprepper normalize -i sample.csv -o output.csv -c "Age,Salary" -m MinMax
# Or if running from source
cd src/FilePrepper.CLI
dotnet run -- normalize -i sample.csv -o output.csv -c "Age,Salary" -m MinMax
Result:
Name,Age,Salary,Department
Alice,0,0,Engineering
Bob,0.5,1,Sales
Charlie,1,0.4,Engineering
David,0.3,0,Sales
Eve,0.7,0.8,Marketing
fileprepper fill-missing -i sample.csv -o output.csv -c "Salary" -m Mean
# Remove unnecessary columns
fileprepper remove-columns -i data.csv -o clean.csv -c "TempCol,Debug,Notes"
# Drop duplicates
fileprepper drop-duplicates -i data.csv -o unique.csv -c "Email"
# Fill missing values
fileprepper fill-missing -i data.csv -o filled.csv -c "Age,Salary" -m Mean
# Convert types
fileprepper convert-type -i data.csv -o typed.csv \
-c "Date:DateTime:yyyy-MM-dd,Age:Integer"
# Normalize values
fileprepper normalize -i data.csv -o norm.csv -c "Price,Quantity" -m MinMax
# Extract date components
fileprepper extract-date -i data.csv -o dated.csv \
--column "OrderDate" --components "Year,Month,Day"
# Calculate statistics
fileprepper stats -i data.csv -o stats.csv -c "Age,Salary,Score"
# Group and aggregate
fileprepper aggregate -i sales.csv -o summary.csv \
--group "Region,Product" \
--aggregations "Sales:Sum,Quantity:Avg"
# Merge files vertically
fileprepper merge file1.csv file2.csv file3.csv -o merged.csv -t Vertical
# Convert format
fileprepper convert-format -i data.csv -o data.json -f JSON
Process multiple columns in a single command:
# Normalize 5 columns simultaneously
fileprepper normalize -i data.csv -o output.csv \
-c "Col1,Col2,Col3,Col4,Col5" -m ZScore
# Convert 3 types at once
fileprepper convert-type -i data.csv -o output.csv \
-c "Date:DateTime,Age:Integer,Price:Decimal"
# Fill missing in multiple columns
fileprepper fill-missing -i data.csv -o output.csv \
-c "Age,Salary,Score" -m Median
Combine commands for complex workflows:
# Step 1: Clean data
fileprepper fill-missing -i raw.csv -o step1.csv -c "Age,Salary" -m Mean
# Step 2: Remove outliers
fileprepper filter-rows -i step1.csv -o step2.csv \
--column "Age" --operator LessThan --value "100"
# Step 3: Normalize
fileprepper normalize -i step2.csv -o step3.csv -c "Age,Salary" -m MinMax
# Step 4: Convert to JSON
fileprepper convert-format -i step3.csv -o final.json -f JSON
# Total: 8 file I/O operations (4 reads + 4 writes)
Use the Pipeline API for in-memory processing:
using FilePrepper.Pipeline;
// Same workflow with only 2 file I/O operations!
await DataPipeline
.FromCsvAsync("raw.csv")
.FillMissing(columns: new[] { "Age", "Salary" }, method: FillMethod.Mean)
.FilterRows(row => int.Parse(row["Age"]) < 100)
.Normalize(columns: new[] { "Age", "Salary" }, method: NormalizationMethod.MinMax)
.ToCsvAsync("final.csv"); // Or .ToJson() for JSON output
// 75% reduction in file I/O!
ML Feature Engineering:
using FilePrepper.Pipeline;
var result = await DataPipeline
.FromCsvAsync("data.csv")
.AddColumn("AgeGroup", row =>
int.Parse(row["Age"]) < 30 ? "Young" : "Senior")
.Normalize(columns: new[] { "Age", "Salary" },
method: NormalizationMethod.MinMax)
.ToDataFrame();
Console.WriteLine($"Processed {result.RowCount} rows");
In-Memory Processing (Zero File I/O):
var data = new List<Dictionary<string, string>>
{
new() { ["Name"] = "Alice", ["Age"] = "25", ["Salary"] = "50000" },
new() { ["Name"] = "Bob", ["Age"] = "30", ["Salary"] = "60000" }
};
var processed = DataPipeline
.FromData(data)
.Normalize(columns: new[] { "Age", "Salary" },
method: NormalizationMethod.MinMax)
.FilterRows(row => double.Parse(row["Age"]) > 0.5)
.ToDataFrame();
// Access results directly - no files needed!
foreach (var row in processed.Rows)
{
Console.WriteLine($"{row["Name"]}: Age={row["Age"]}, Salary={row["Salary"]}");
}
Handle dirty data gracefully:
# Ignore errors and use defaults
fileprepper normalize -i messy.csv -o clean.csv \
-c "Age,Salary" -m MinMax \
--ignore-errors --default-value "0"
fileprepper --help
fileprepper normalize --help
fileprepper aggregate --help
fileprepper --help | grep " "
- CLI Reference - Complete command reference
- Common Scenarios - Real-world examples
- API Reference - Programmatic usage
- Test on small files first - Verify your command works
- Use --help - Every command has detailed documentation
- Check headers - Ensure --has-header matches your file
- Quote column names - Use quotes for names with spaces
- Backup originals - Keep copies before transforming
Install as global tool:
# Install globally
dotnet tool install -g fileprepper-cli
# Or run from source
cd src/FilePrepper.CLI
dotnet run -- <command>
- Check column names match exactly (case-sensitive)
- Verify header row exists with --has-header true
- Use quotes for column names with spaces
- Process file in smaller batches
- Use appropriate data types
- Consider streaming solutions for very large files