""" Example: Analyze your own gene expression data with auto-reporting. This script demonstrates how to: 1. Validate your own data files 2. Run analysis with auto-timestamped outputs 3. Generate comprehensive analysis reports 4. Interpret results using the automated README """ import sys from pathlib import Path import time # Add src to path sys.path.insert(0, str(Path(__file__).parent.parent / "src")) from stdm import ( DataLoader, TensorBuilder, CPDecomposition, DataValidator, AnalysisReport, create_timestamped_output_dir ) from stdm.utils import save_results, compute_explained_variance def analyze_custom_data( data_path: str, output_base_dir: str = "results", rank: int = 10, normalize: bool = True ): """ Analyze custom gene expression data with full reporting. Parameters ---------- data_path : str Path to your data file (merged CSV) or directory (separate files) output_base_dir : str Base directory for results rank : int Number of components for decomposition normalize : bool Whether to normalize data """ print("=" * 70) print("Custom Data Analysis with STDM") print("=" * 70) # Step 1: Validate your data print(f"\nšŸ“‹ Step 1: Validating your data...") print(f" Data path: {data_path}") validator = DataValidator(strict=False) data_path_obj = Path(data_path) if not data_path_obj.exists(): print(f"\nāŒ Error: Path not found: {data_path}") print("\nšŸ’” Tips:") print(" - Check the file/directory path is correct") print(" - Use absolute path if relative path doesn't work") print(" - Ensure file has read permissions") return # Validate based on type if data_path_obj.is_file(): print(" Detected: Merged data file") is_valid, messages = validator.validate_merged_data(data_path) elif data_path_obj.is_dir(): print(" Detected: Directory with separate files") is_valid, messages = validator.validate_separate_data(data_path) else: print(f"\nāŒ Error: {data_path} is neither a file nor directory") return # Print validation report validator.print_report() if not is_valid: print("\nāŒ Data validation failed!") print("\nšŸ’” Common issues:") print(" - Non-numeric values in expression data") print(" - Missing or malformed sample names") print(" - Inconsistent gene IDs across files") print(" - Very few genes or samples") print("\n Fix these issues and try again.") return print("\nāœ… Data validation passed!") # Step 2: Create timestamped output directory print(f"\nšŸ“ Step 2: Creating output directory...") output_dir = create_timestamped_output_dir(output_base_dir, prefix="custom") print(f" Output: {output_dir}") print(f" This run ID: {output_dir.name}") # Step 3: Load data start_time = time.time() print(f"\nšŸ“‚ Step 3: Loading data...") if data_path_obj.is_file(): loader = DataLoader(data_path_obj.parent) data = loader.load_merged_data(data_path_obj.name) data_type = "merged" print(f" Loaded: {data.shape[0]} genes Ɨ {data.shape[1]} samples") else: loader = DataLoader(data_path) data_dict = loader.load_separate_data() data_type = "separate" print(f" Loaded {len(data_dict)} species files") # Step 4: Preprocess if normalize: print(f"\nšŸ”„ Step 4: Normalizing data...") if data_type == "merged": data = loader.normalize_data(data, method="zscore") print(" Applied z-score normalization") else: for species in data_dict: data_dict[species] = loader.normalize_data( data_dict[species], method="zscore" ) print(f" Applied z-score normalization to all {len(data_dict)} species") # Step 5: Build tensor print(f"\nšŸ”Ø Step 5: Building tensor...") builder = TensorBuilder() if data_type == "merged": tensor = builder.from_merged_data(data) else: tensor = builder.from_separate_data(data_dict) metadata = builder.get_metadata() print(f" Tensor shape: {tensor.shape}") print(f" Dimensions: Genes={tensor.shape[0]}, Individuals={tensor.shape[1]}, ") print(f" Time={tensor.shape[2]}, Species={tensor.shape[3]}") # Step 6: Fit model print(f"\nšŸš€ Step 6: Fitting CP decomposition (rank={rank})...") model = CPDecomposition(rank=rank, random_state=42) model.fit(tensor, n_iter_max=100, verbose=True) # Step 7: Compute metrics print(f"\nšŸ“Š Step 7: Computing performance metrics...") factors = model.get_factors() error = model.compute_reconstruction_error(tensor) reconstructed = model.reconstruct() explained_var = compute_explained_variance(tensor, reconstructed) runtime = time.time() - start_time print(f"\n Results:") print(f" ā”œā”€ Reconstruction error: {error:.6f}") print(f" ā”œā”€ Explained variance: {explained_var:.4f} ({explained_var*100:.2f}%)") print(f" └─ Runtime: {runtime:.1f}s") # Step 8: Save results print(f"\nšŸ’¾ Step 8: Saving results...") save_results(factors, metadata, output_dir, prefix="cp") print(f" āœ… Factor matrices saved") print(f" āœ… Metadata saved") # Step 9: Generate comprehensive report print(f"\nšŸ“ Step 9: Generating analysis report...") reporter = AnalysisReport(output_dir) report_path = reporter.generate_run_report( method="CP", tensor_shape=tensor.shape, rank=rank, reconstruction_error=error, explained_variance=explained_var, runtime_seconds=runtime, metadata=metadata, factors=factors ) print(f" āœ… Report generated: {report_path.name}") # Step 10: Summary print("\n" + "=" * 70) print("āœ… Analysis Complete!") print("=" * 70) print(f"\nšŸ“ All results saved to: {output_dir}") print(f"\nšŸ“„ Output files:") print(f" ā”œā”€ ANALYSIS_README.md - Comprehensive analysis report") print(f" ā”œā”€ cp_factor_genes.csv - Gene loadings ({tensor.shape[0]} Ɨ {rank})") print(f" ā”œā”€ cp_factor_individuals.csv - Individual patterns") print(f" ā”œā”€ cp_factor_timepoints.csv - Temporal dynamics") print(f" ā”œā”€ cp_factor_species.csv - Species-specific patterns") print(f" └─ cp_metadata.json - Dimension labels") print(f"\nšŸ“– Next Steps:") print(f" 1. Read: {output_dir}/ANALYSIS_README.md") print(f" This file explains:") print(f" • How to interpret each output file") print(f" • Optimal parameter recommendations") print(f" • Quality assessment of your results") print(f" • Troubleshooting tips") print(f"\n 2. Explore factor matrices:") print(f" import pandas as pd") print(f" genes = pd.read_csv('{output_dir}/cp_factor_genes.csv', index_col=0)") print(f" print(genes.head())") print(f"\n 3. Extract marker genes for component 0:") print(f" top_genes = genes['component_0'].abs().nlargest(50)") print(f"\n 4. Compare results across runs:") print(f" Each run is timestamped, so you can:") print(f" • Try different ranks (5, 10, 15, 20)") print(f" • Test normalization methods") print(f" • Compare Tucker vs CP") print("\n" + "=" * 70) def main(): """Main function.""" # Example 1: Use provided sample data print("=" * 70) print("Example 1: Analyzing provided sample data") print("=" * 70) sample_data = Path(__file__).parent.parent / "data" / "merge-vst" / "vst_counts_matrix.csv" if sample_data.exists(): analyze_custom_data( data_path=str(sample_data), output_base_dir="results/custom_example", rank=10, normalize=True ) else: print(f"\nāš ļø Sample data not found at {sample_data}") print(" Using your own data instead...") # Example 2: Guide for using your own data print("\n\n" + "=" * 70) print("Using Your Own Data") print("=" * 70) print(""" To analyze your own data, use one of these approaches: **Option 1: Command Line** ```bash stdm-fit --data-path /path/to/your/data.csv \\ --method cp --rank 15 --normalize \\ --output-dir results/my_analysis ``` **Option 2: Python API** ```python from stdm import DataValidator, create_timestamped_output_dir # First, validate your data validator = DataValidator() is_valid, messages = validator.validate_merged_data("your_data.csv") validator.print_report() # Then run analysis analyze_custom_data( data_path="your_data.csv", output_base_dir="results/my_project", rank=15 ) ``` **Your Data Format Should Be:** Merged CSV: - Rows: Genes (with gene IDs as row names) - Columns: Samples (format: SPECIES-INDIVIDUAL-TIMEPOINT) - Values: Normalized expression (numeric) Example: ``` group_id,ACR-001-TP1,ACR-001-TP2,POR-002-TP1,... GENE001,5.2,6.1,4.8,... GENE002,7.3,8.2,6.9,... ``` Separate Directory: - Multiple CSV files, one per species - File naming: species_normalized_expression.csv - Same format as merged, but one species per file **Need Help?** - Run with --validate-only to check your data format - Check GETTING_STARTED.md for detailed examples - See examples/custom_data_analysis.py (this file!) """) if __name__ == "__main__": main()