Advertisement
UF6

GeoMX Analysis Workflow

UF6
Jan 13th, 2025
71
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.21 KB | Source Code | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. from sklearn.preprocessing import QuantileTransformer
  4. import statsmodels.api as sm
  5. from statsmodels.stats.multitest import multipletests
  6. import seaborn as sns
  7. import matplotlib.pyplot as plt
  8. from sklearn.decomposition import PCA
  9.  
  10. # Step 1: Data Import and Setup
  11. def load_and_merge_data(expression_file, metadata_file):
  12.     # Load data
  13.     expression_data = pd.read_csv(expression_file)
  14.     sample_metadata = pd.read_csv(metadata_file)
  15.  
  16.     # Merge data on a common identifier
  17.     merged_data = expression_data.merge(sample_metadata, on="SampleID")
  18.  
  19.     print("Merged Data Preview:")
  20.     print(merged_data.head())
  21.     return expression_data, sample_metadata, merged_data
  22.  
  23. # Step 2: Quality Control
  24. def quality_control(expression_data, min_counts=10, min_samples=5):
  25.     # Filter genes with insufficient counts
  26.     gene_filter = (expression_data.iloc[:, 1:] >= min_counts).sum(axis=1) >= min_samples
  27.     filtered_data = expression_data[gene_filter]
  28.  
  29.     # Handle missing data
  30.     filtered_data = filtered_data.fillna(0)
  31.  
  32.     print("Filtered Expression Data Preview:")
  33.     print(filtered_data.head())
  34.     return filtered_data
  35.  
  36. # Step 3: Data Normalization
  37. def normalize_data(filtered_expression_data):
  38.     qt = QuantileTransformer(output_distribution='normal', random_state=42)
  39.     normalized_data = pd.DataFrame(
  40.         qt.fit_transform(filtered_expression_data.iloc[:, 1:]),
  41.         columns=filtered_expression_data.columns[1:],
  42.         index=filtered_expression_data.index
  43.     )
  44.     normalized_data.insert(0, "SampleID", filtered_expression_data["SampleID"])
  45.  
  46.     print("Normalized Expression Data Preview:")
  47.     print(normalized_data.head())
  48.     return normalized_data
  49.  
  50. # Step 4: Statistical Analysis
  51. def differential_expression_analysis(normalized_data, sample_metadata):
  52.     condition = sample_metadata['Condition']
  53.     design_matrix = pd.get_dummies(condition, drop_first=True)
  54.  
  55.     results = []
  56.     for gene in normalized_data.columns[1:]:
  57.         y = normalized_data[gene]
  58.         model = sm.OLS(y, design_matrix)
  59.         result = model.fit()
  60.         results.append({
  61.             "Gene": gene,
  62.             "p-value": result.pvalues[0],
  63.             "Coefficient": result.params[0]
  64.         })
  65.  
  66.     results_df = pd.DataFrame(results)
  67.  
  68.     # Multiple testing correction
  69.     results_df['Adjusted p-value'] = multipletests(results_df['p-value'], method='fdr_bh')[1]
  70.  
  71.     print("Differential Expression Results Preview:")
  72.     print(results_df.head())
  73.     return results_df
  74.  
  75. # Step 5: Visualization
  76. def create_visualizations(normalized_data, sample_metadata):
  77.     condition = sample_metadata['Condition']
  78.  
  79.     # Boxplot for a single gene
  80.     sns.boxplot(x=condition, y=normalized_data['Gene1'])
  81.     plt.title("Boxplot of Gene1 by Condition")
  82.     plt.show()
  83.  
  84.     # PCA plot
  85.     pca = PCA(n_components=2)
  86.     pca_result = pca.fit_transform(normalized_data.iloc[:, 1:])
  87.     pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2"])
  88.     pca_df["Condition"] = condition
  89.  
  90.     sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Condition")
  91.     plt.title("PCA of Gene Expression")
  92.     plt.show()
  93.  
  94. # Step 6: Export Results
  95. def export_results(normalized_data, results_df, normalized_file, results_file):
  96.     normalized_data.to_csv(normalized_file, index=False)
  97.     results_df.to_csv(results_file, index=False)
  98.     print(f"Results exported to:\n- {normalized_file}\n- {results_file}")
  99.  
  100. # Main Workflow
  101. def main():
  102.     # File paths (replace with actual file paths)
  103.     expression_file = 'expression_data.csv'
  104.     metadata_file = 'sample_metadata.csv'
  105.     normalized_file = 'normalized_expression_data.csv'
  106.     results_file = 'differential_expression_results.csv'
  107.  
  108.     # Workflow
  109.     expression_data, sample_metadata, merged_data = load_and_merge_data(expression_file, metadata_file)
  110.     filtered_data = quality_control(expression_data)
  111.     normalized_data = normalize_data(filtered_data)
  112.     results_df = differential_expression_analysis(normalized_data, sample_metadata)
  113.     create_visualizations(normalized_data, sample_metadata)
  114.     export_results(normalized_data, results_df, normalized_file, results_file)
  115.  
  116. # Run the workflow
  117. if __name__ == "__main__":
  118.     main()
  119.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement