Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- from sklearn.preprocessing import QuantileTransformer
- import statsmodels.api as sm
- from statsmodels.stats.multitest import multipletests
- import seaborn as sns
- import matplotlib.pyplot as plt
- from sklearn.decomposition import PCA
- # Step 1: Data Import and Setup
- def load_and_merge_data(expression_file, metadata_file):
- # Load data
- expression_data = pd.read_csv(expression_file)
- sample_metadata = pd.read_csv(metadata_file)
- # Merge data on a common identifier
- merged_data = expression_data.merge(sample_metadata, on="SampleID")
- print("Merged Data Preview:")
- print(merged_data.head())
- return expression_data, sample_metadata, merged_data
- # Step 2: Quality Control
- def quality_control(expression_data, min_counts=10, min_samples=5):
- # Filter genes with insufficient counts
- gene_filter = (expression_data.iloc[:, 1:] >= min_counts).sum(axis=1) >= min_samples
- filtered_data = expression_data[gene_filter]
- # Handle missing data
- filtered_data = filtered_data.fillna(0)
- print("Filtered Expression Data Preview:")
- print(filtered_data.head())
- return filtered_data
- # Step 3: Data Normalization
- def normalize_data(filtered_expression_data):
- qt = QuantileTransformer(output_distribution='normal', random_state=42)
- normalized_data = pd.DataFrame(
- qt.fit_transform(filtered_expression_data.iloc[:, 1:]),
- columns=filtered_expression_data.columns[1:],
- index=filtered_expression_data.index
- )
- normalized_data.insert(0, "SampleID", filtered_expression_data["SampleID"])
- print("Normalized Expression Data Preview:")
- print(normalized_data.head())
- return normalized_data
- # Step 4: Statistical Analysis
- def differential_expression_analysis(normalized_data, sample_metadata):
- condition = sample_metadata['Condition']
- design_matrix = pd.get_dummies(condition, drop_first=True)
- results = []
- for gene in normalized_data.columns[1:]:
- y = normalized_data[gene]
- model = sm.OLS(y, design_matrix)
- result = model.fit()
- results.append({
- "Gene": gene,
- "p-value": result.pvalues[0],
- "Coefficient": result.params[0]
- })
- results_df = pd.DataFrame(results)
- # Multiple testing correction
- results_df['Adjusted p-value'] = multipletests(results_df['p-value'], method='fdr_bh')[1]
- print("Differential Expression Results Preview:")
- print(results_df.head())
- return results_df
- # Step 5: Visualization
- def create_visualizations(normalized_data, sample_metadata):
- condition = sample_metadata['Condition']
- # Boxplot for a single gene
- sns.boxplot(x=condition, y=normalized_data['Gene1'])
- plt.title("Boxplot of Gene1 by Condition")
- plt.show()
- # PCA plot
- pca = PCA(n_components=2)
- pca_result = pca.fit_transform(normalized_data.iloc[:, 1:])
- pca_df = pd.DataFrame(pca_result, columns=["PC1", "PC2"])
- pca_df["Condition"] = condition
- sns.scatterplot(data=pca_df, x="PC1", y="PC2", hue="Condition")
- plt.title("PCA of Gene Expression")
- plt.show()
- # Step 6: Export Results
- def export_results(normalized_data, results_df, normalized_file, results_file):
- normalized_data.to_csv(normalized_file, index=False)
- results_df.to_csv(results_file, index=False)
- print(f"Results exported to:\n- {normalized_file}\n- {results_file}")
- # Main Workflow
- def main():
- # File paths (replace with actual file paths)
- expression_file = 'expression_data.csv'
- metadata_file = 'sample_metadata.csv'
- normalized_file = 'normalized_expression_data.csv'
- results_file = 'differential_expression_results.csv'
- # Workflow
- expression_data, sample_metadata, merged_data = load_and_merge_data(expression_file, metadata_file)
- filtered_data = quality_control(expression_data)
- normalized_data = normalize_data(filtered_data)
- results_df = differential_expression_analysis(normalized_data, sample_metadata)
- create_visualizations(normalized_data, sample_metadata)
- export_results(normalized_data, results_df, normalized_file, results_file)
- # Run the workflow
- if __name__ == "__main__":
- main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement