diff --git a/.gitignore b/.gitignore index b980b94..a7b0ebe 100644 --- a/.gitignore +++ b/.gitignore @@ -10,7 +10,6 @@ Scully_mannitol_2025/scRNA-seq_python_scripts/raw_unfiltered_data/* # Large files to exclude in Scully et al. Ciona atlas Scully_Ciona_blood_2025/data/* !Scully_Ciona_blood_2025/data/README.md -Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/go_enrichment_analysis_output/* Scully_Ciona_blood_2025/scRNA-seq_analysis/gene_enrichment_analysis/curated_gene_sets_fishers_output/* Scully_Ciona_blood_2025/scRNA-seq_analysis/gene_enrichment_analysis/curated_gene_sets_gsea_output/* Scully_Ciona_blood_2025/scRNA-seq_analysis/gene_enrichment_analysis/kegg_pathway_fishers_output/* @@ -18,6 +17,10 @@ Scully_Ciona_blood_2025/scRNA-seq_analysis/gene_enrichment_analysis/kegg_pathway Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/* Scully_Ciona_blood_2025/species_comparisons/gs_plots/gs_plots_Hs_vs_Cr_output/* Scully_Ciona_blood_2025/species_comparisons/gs_plots/gs_plots_Hs_vs_Dr_output/* +Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/coexpr_cons_Hs_vs_Cr_output/* +Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/coexpr_cons_Hs_vs_Dr_output/* +Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/tf_coexpr_Hs_vs_Cr_output/* +Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/tf_coexpr_Hs_vs_Dr_output/* Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/raw_images/* !Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/raw_images/SAMPLE_INFO.csv Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/image_analysis_pipeline_output/* diff --git a/Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/README.md b/Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/README.md index 9958902..b78339b 100644 --- a/Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/README.md +++ b/Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/README.md @@ -2,7 +2,7 @@ The scripts in this folder were used to analyze images for HCR FISH, specificall Key files in this folder: - `image_analysis_pipeline.py` runs the image analysis pipeline. This readme describes how to run it, and what file structure it expects for raw image files. -- `napari_image_analysis_env.txt` contains information on the conda environment used, saved to a text file (by running `conda list --explicit > napari_image_analysis_env.txt`). +- `napari_image_analysis_env.yml` and `napari_image_analysis_env.txt` contain information on the conda environment used, saved to a text file (by running `conda env export --no-builds > napari_image_analysis_env.yml` and `conda list --explicit > napari_image_analysis_env.txt`). The .txt files is for MacOS only. - `raw_images/` is a folder which must contain the raw imaging files to successfully run `image_analysis_pipeline.py`. See "[Subfolder structure](#subfolder-structure)" for more detail. - `HCR_PROBES_MASTER_LIST.xlsx` is an excel spreadsheet containing a conversion between probe IDs used in python scripts (e.g. HP14) and genes labeled (e.g. KY21.Chr11.687). This is needed to run `image_analysis_pipeline.py`. diff --git a/Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/napari_image_analysis_env.yml b/Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/napari_image_analysis_env.yml new file mode 100644 index 0000000..b2dfada --- /dev/null +++ b/Scully_Ciona_blood_2025/HCR_FISH/napari_image_analysis_pipeline/napari_image_analysis_env.yml @@ -0,0 +1,187 @@ +name: napari-env +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - bzip2=1.0.8 + - ca-certificates=2021.10.8 + - libffi=3.4.2 + - libzlib=1.2.11 + - ncurses=6.3 + - openssl=3.0.0 + - pip=22.0.4 + - python=3.9.10 + - python_abi=3.9 + - readline=8.1 + - setuptools=60.9.3 + - sqlite=3.37.0 + - tk=8.6.12 + - tzdata=2021e + - wheel=0.37.1 + - xz=5.2.5 + - zlib=1.2.11 + - pip: + - absl-py==1.4.0 + - alabaster==0.7.12 + - appdirs==1.4.4 + - appnope==0.1.2 + - asttokens==2.0.5 + - astunparse==1.6.3 + - attrs==21.4.0 + - babel==2.9.1 + - backcall==0.2.0 + - cachetools==5.3.0 + - cachey==0.2.1 + - certifi==2021.10.8 + - charset-normalizer==2.0.12 + - click==8.0.4 + - cloudpickle==2.0.0 + - contourpy==1.0.7 + - csbdeep==0.7.3 + - cycler==0.11.0 + - dask==2022.2.1 + - debugpy==1.5.1 + - decorator==5.1.1 + - docstring-parser==0.13 + - docutils==0.17.1 + - entrypoints==0.4 + - et-xmlfile==1.1.0 + - executing==0.8.3 + - flatbuffers==23.3.3 + - fonttools==4.38.0 + - freetype-py==2.2.0 + - fsspec==2022.2.0 + - gast==0.4.0 + - google-auth==2.17.0 + - google-auth-oauthlib==0.4.6 + - google-pasta==0.2.0 + - grpcio==1.53.0 + - h5py==3.8.0 + - heapdict==1.0.1 + - hsluv==5.0.2 + - idna==3.3 + - imagecodecs==2023.3.16 + - imageio==2.16.1 + - imagesize==1.3.0 + - imglyb==2.0.1 + - importlib-metadata==4.11.2 + - intervaltree==3.1.0 + - ipykernel==6.9.1 + - ipython==8.1.1 + - ipython-genutils==0.2.0 + - jax==0.4.8 + - jedi==0.18.1 + - jgo==1.0.5 + - jinja2==3.0.3 + - joblib==1.2.0 + - jpype1==1.4.1 + - jsonschema==4.4.0 + - jupyter-client==7.1.2 + - jupyter-core==4.9.2 + - keras==2.12.0 + - kiwisolver==1.3.2 + - labeling==0.1.13 + - libclang==16.0.0 + - llvmlite==0.39.1 + - locket==0.2.1 + - magicgui==0.3.7 + - markdown==3.4.3 + - markupsafe==2.1.2 + - matplotlib==3.6.3 + - matplotlib-inline==0.1.3 + - matplotlib-scalebar==0.8.1 + - ml-dtypes==0.0.4 + - napari==0.4.14 + - napari-console==0.0.4 + - napari-plugin-engine==0.2.0 + - napari-svg==0.1.6 + - nd2==0.5.3 + - nd2reader==3.3.0 + - nest-asyncio==1.5.4 + - networkx==2.7.1 + - npe2==0.1.2 + - numba==0.56.4 + - numpy==1.22.3 + - numpydoc==1.2 + - oauthlib==3.2.2 + - openpyxl==3.1.2 + - opt-einsum==3.3.0 + - packaging==21.3 + - pandas==1.4.1 + - parso==0.8.3 + - partd==1.2.0 + - pexpect==4.8.0 + - pickleshare==0.7.5 + - pillow==9.0.1 + - pims==0.6.1 + - pint==0.18 + - pooch==1.6.0 + - prompt-toolkit==3.0.28 + - protobuf==4.22.1 + - psutil==5.9.0 + - psygnal==0.3.3 + - ptyprocess==0.7.0 + - pure-eval==0.2.2 + - pyasn1==0.4.8 + - pyasn1-modules==0.2.8 + - pydantic==1.9.0 + - pygments==2.11.2 + - pyopengl==3.1.6 + - pyparsing==3.0.7 + - pyqt5==5.15.6 + - pyqt5-qt5==5.15.2 + - pyqt5-sip==12.9.1 + - pyrsistent==0.18.1 + - python-dateutil==2.8.2 + - pytomlpp==1.0.10 + - pytz==2021.3 + - pywavelets==1.2.0 + - pyyaml==6.0 + - pyzmq==22.3.0 + - qtconsole==5.2.2 + - qtpy==2.0.1 + - requests==2.27.1 + - requests-oauthlib==1.3.1 + - resource-backed-dask-array==0.1.0 + - rsa==4.9 + - scikit-image==0.19.2 + - scikit-learn==1.2.1 + - scipy==1.8.0 + - scyjava==1.8.1 + - seaborn==0.12.2 + - six==1.16.0 + - slicerator==1.1.0 + - snowballstemmer==2.2.0 + - sortedcontainers==2.4.0 + - sphinx==4.4.0 + - sphinxcontrib-applehelp==1.0.2 + - sphinxcontrib-devhelp==1.0.2 + - sphinxcontrib-htmlhelp==2.0.0 + - sphinxcontrib-jsmath==1.0.1 + - sphinxcontrib-qthelp==1.0.3 + - sphinxcontrib-serializinghtml==1.1.5 + - stack-data==0.2.0 + - superqt==0.3.1 + - tensorboard==2.12.0 + - tensorboard-data-server==0.7.0 + - tensorboard-plugin-wit==1.8.1 + - tensorflow-estimator==2.12.0 + - tensorflow-io-gcs-filesystem==0.32.0 + - termcolor==2.2.0 + - threadpoolctl==3.1.0 + - tifffile==2022.2.9 + - toolz==0.11.2 + - tornado==6.1 + - tqdm==4.63.0 + - traitlets==5.1.1 + - typer==0.4.0 + - typing-extensions==4.1.1 + - urllib3==1.26.8 + - vispy==0.9.6 + - wcwidth==0.2.5 + - werkzeug==2.2.3 + - wrapt==1.13.3 + - xarray==2023.2.0 + - xmltodict==0.13.0 + - zipp==3.7.0 diff --git a/Scully_Ciona_blood_2025/README.md b/Scully_Ciona_blood_2025/README.md index 00d11af..9767495 100644 --- a/Scully_Ciona_blood_2025/README.md +++ b/Scully_Ciona_blood_2025/README.md @@ -26,3 +26,4 @@ Contains several helper function files used in scripts throughout the folder. ### Subfolder `cross_species_comparisons/` - `gs_plots/` contains scripts for making generalized Sankey (GS) plots and calculating Jaccard similarity scores. +- `coexpression_conservation/` contains scripts for quantifying gene coexpression conservation. diff --git a/Scully_Ciona_blood_2025/helper_functions/gillis_style_coexpression_hf.py b/Scully_Ciona_blood_2025/helper_functions/gillis_style_coexpression_hf.py index e47d8ec..d1125c1 100644 --- a/Scully_Ciona_blood_2025/helper_functions/gillis_style_coexpression_hf.py +++ b/Scully_Ciona_blood_2025/helper_functions/gillis_style_coexpression_hf.py @@ -9,8 +9,6 @@ # Change this path to point to folder containing gene_hf.py # This imports dictionaries and functions for easily converting gene ids -path_to_dropbox = os.environ['PATH_TO_DROPBOX'] -sys.path.append(path_to_dropbox + 'klein_lab/resources/helper_functions') import scrna_helper_functions as hf # ============================================================================ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/README.md b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/README.md index 152ae22..57f8110 100644 --- a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/README.md +++ b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/README.md @@ -2,4 +2,4 @@ The scripts in this folder were used to generate a hypothesized hematopoietic hi The files are: - `differentiation_graph.py` is the python script that uses the scRNA-seq data to generate a hypothesized hierarchy. The script outputs a folder `differentiation_graph_output/` containing plots of the inferred hierarchy (graph, adjacency matrix), a text file listing cell states not connected to other nodes in the graph, as well as miscellaneous supporting plots. -- `differentiation_graph_env.txt` contains information on the conda environment used, saved to a text file (by running `conda list --explicit > differentiation_graph_env.txt`). +- `differentiation_graph_env.yml` and `differentiation_graph_env.txt` contain information on the conda environment used, saved to a text file (by running `conda env export --no-builds > differentiation_graph_env.yml` and `conda list --explicit > differentiation_graph_env.txt`). The .txt files is for MacOS only. diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_env.yml b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_env.yml new file mode 100644 index 0000000..f9d5f99 --- /dev/null +++ b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_env.yml @@ -0,0 +1,173 @@ +name: sc_env_graphs +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - appnope=0.1.2 + - asttokens=2.4.1 + - backcall=0.2.0 + - ca-certificates=2024.8.30 + - certifi=2024.8.30 + - executing=2.1.0 + - freetype=2.10.4 + - graphviz=2.42.3 + - ipykernel=5.3.4 + - ipython_genutils=0.2.0 + - jpeg=9e + - jupyter_client=6.1.12 + - jupyter_core=4.7.1 + - libcxx=10.0.0 + - libffi=3.3 + - libpng=1.6.37 + - libsodium=1.0.18 + - libtiff=4.1.0 + - libwebp-base=1.1.0 + - lz4-c=1.9.2 + - matplotlib-inline=0.1.2 + - ncurses=6.2 + - openssl=1.1.1w + - parso=0.8.2 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pip=21.1.3 + - prompt_toolkit=3.0.48 + - ptyprocess=0.7.0 + - pure_eval=0.2.3 + - pygments=2.9.0 + - pygraphviz=1.7 + - python=3.8.10 + - python-dateutil=2.8.2 + - python_abi=3.8 + - pyzmq=20.0.0 + - readline=8.1 + - setuptools=52.0.0 + - six=1.16.0 + - sqlite=3.36.0 + - stack_data=0.6.2 + - tk=8.6.10 + - tornado=6.1 + - traitlets=5.0.5 + - typing_extensions=4.12.2 + - wcwidth=0.2.5 + - wheel=0.36.2 + - xz=5.2.5 + - zeromq=4.3.4 + - zlib=1.2.11 + - zstd=1.4.5 + - pip: + - anndata==0.7.6 + - annoy==1.17.0 + - anyio==3.4.0 + - argon2-cffi==21.1.0 + - astroid==2.13.3 + - attrs==21.2.0 + - babel==2.9.1 + - bbknn==1.6.0 + - bleach==4.1.0 + - blosc2==2.0.0 + - cffi==1.15.0 + - charset-normalizer==2.0.7 + - cycler==0.10.0 + - cython==0.29.24 + - decorator==4.4.2 + - defusedxml==0.7.1 + - dill==0.3.6 + - entrypoints==0.3 + - et-xmlfile==1.1.0 + - fastcluster==1.2.6 + - fbpca==1.0 + - geosketch==1.2 + - h5py==3.3.0 + - harmonypy==0.0.9 + - idna==3.3 + - imageio==2.9.0 + - importlib-metadata==8.4.0 + - importlib-resources==5.4.0 + - intervaltree==3.1.0 + - ipython==7.25.0 + - isort==5.11.4 + - jedi==0.18.0 + - jinja2==3.0.3 + - joblib==1.2.0 + - json5==0.9.6 + - jsonschema==4.2.1 + - jupyter-server==1.12.0 + - jupyterlab==3.2.4 + - jupyterlab-pygments==0.1.2 + - jupyterlab-server==2.8.2 + - kiwisolver==1.3.1 + - lazy-object-proxy==1.9.0 + - leidenalg==0.8.7 + - llvmlite==0.41.1 + - louvain==0.7.0 + - lxml==4.8.0 + - markupsafe==2.0.1 + - matplotlib==3.4.2 + - matplotlib-venn==0.11.9 + - mccabe==0.7.0 + - mistune==0.8.4 + - msgpack==1.0.8 + - natsort==7.1.1 + - nbclassic==0.3.4 + - nbclient==0.5.9 + - nbconvert==6.3.0 + - nbformat==5.1.3 + - nest-asyncio==1.5.1 + - networkx==2.5.1 + - notebook==6.4.6 + - numba==0.58.1 + - numexpr==2.7.3 + - numpy==1.24.4 + - openpyxl==3.1.2 + - packaging==21.0 + - pandas==1.3.0 + - pandocfilters==1.5.0 + - patsy==0.5.6 + - pillow==8.3.1 + - platformdirs==2.6.2 + - prometheus-client==0.12.0 + - prompt-toolkit==3.0.19 + - py-cpuinfo==9.0.0 + - pycparser==2.21 + - pydot==3.0.2 + - pylint==2.15.10 + - pynndescent==0.5.4 + - pyparsing==3.1.4 + - pyrsistent==0.18.0 + - python-igraph==0.9.6 + - python-louvain==0.16 + - pytz==2021.1 + - pywavelets==1.1.1 + - requests==2.26.0 + - scanorama==1.7.3 + - scanpy==1.8.1 + - scikit-image==0.18.2 + - scikit-learn==1.2.1 + - scipy==1.10.1 + - scrublet==0.2.3 + - seaborn==0.11.1 + - send2trash==1.8.0 + - sinfo==0.3.4 + - sniffio==1.2.0 + - sortedcontainers==2.4.0 + - statsmodels==0.13.1 + - stdlib-list==0.8.0 + - tables==3.8.0 + - terminado==0.12.1 + - testpath==0.5.0 + - texttable==1.6.4 + - threadpoolctl==2.2.0 + - tifffile==2021.8.8 + - tomli==2.0.1 + - tomlkit==0.11.6 + - tqdm==4.61.2 + - typing-extensions==4.4.0 + - umap-learn==0.5.1 + - urllib3==1.26.7 + - vireosnp==0.5.8 + - webencodings==0.5.1 + - websocket-client==1.2.1 + - wrapt==1.14.1 + - xlrd==1.2.0 + - zipp==3.6.0 diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_1.pdf b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_1.pdf new file mode 100644 index 0000000..eec8b2a Binary files /dev/null and b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_1.pdf differ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_2.pdf b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_2.pdf new file mode 100644 index 0000000..dbfb478 Binary files /dev/null and b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_2.pdf differ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_3.pdf b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_3.pdf new file mode 100644 index 0000000..d12a52e Binary files /dev/null and b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_3.pdf differ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_adjacency_matrix.pdf b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_adjacency_matrix.pdf new file mode 100644 index 0000000..de436c9 Binary files /dev/null and b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/cell_type_graph_adjacency_matrix.pdf differ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/connections_heatmap_all_to_all.pdf b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/connections_heatmap_all_to_all.pdf new file mode 100644 index 0000000..15d03aa Binary files /dev/null and b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/connections_heatmap_all_to_all.pdf differ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/connections_heatmap_prog_to_mature.pdf b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/connections_heatmap_prog_to_mature.pdf new file mode 100644 index 0000000..ac71702 Binary files /dev/null and b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/connections_heatmap_prog_to_mature.pdf differ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/edge_weight_distribution.pdf b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/edge_weight_distribution.pdf new file mode 100644 index 0000000..dd19b3f Binary files /dev/null and b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/edge_weight_distribution.pdf differ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/mki67_expr_per_cell_type.png b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/mki67_expr_per_cell_type.png new file mode 100644 index 0000000..4b7411d Binary files /dev/null and b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/mki67_expr_per_cell_type.png differ diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/nodes_without_edges.txt b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/nodes_without_edges.txt new file mode 100644 index 0000000..df7a247 --- /dev/null +++ b/Scully_Ciona_blood_2025/scRNA-seq_analysis/differentiation_hierarchy_analysis/differentiation_graph_output/nodes_without_edges.txt @@ -0,0 +1,8 @@ +GA +LGH/MC +ND-5 +ND-6 +RA-2 +SGH-1 +SGH-2 +SRC \ No newline at end of file diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/README.md b/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/README.md deleted file mode 100644 index e52f751..0000000 --- a/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/README.md +++ /dev/null @@ -1,5 +0,0 @@ -The scripts in this folder were used to run Gene Ontology (GO) enrichment analysis. See methods section "Gene Ontology Enrichment Analysis". - -The files are: -- `go_enrichment_analysis.py` is the python script that performs GO analysis. The script outputs a folder `go_enrichment_analysis_output/` containing tsv files with results from Fisher's exact test of GO term enrichment for each cell state's enriched DEGs. -- `go_enrichment_env.txt` contains information on the conda environment used, saved to a text file (by running `conda list --explicit > go_enrichment_env.txt`). diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/go_enrichment_analysis.py b/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/go_enrichment_analysis.py deleted file mode 100644 index 3e73c14..0000000 --- a/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/go_enrichment_analysis.py +++ /dev/null @@ -1,247 +0,0 @@ -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -import scanpy as sc -import gseapy as gp -import sys -import os -from scipy.stats import fisher_exact -from statsmodels.stats.multitest import fdrcorrection -from tqdm import tqdm - -# Change this path to point to folder containing helper functions scripts -path_to_repo_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -sys.path.append(os.path.join(path_to_repo_dir, 'helper_functions')) -import scrna_helper_functions as hf - -# ============================================================================ - - -def reorder_matrix(centr, row_order='none', - col_order='none', - method_row='single', metric_row='cosine', - method_col='single', metric_col='cosine', - count_sort_row='descending', - count_sort_col='ascending', - save_as=''): - # For hierarchical clustering, use a matrix with no nan or inf - centr2 = centr.copy() - centr2.replace(np.inf, 300, inplace=True) - centr2.replace(np.nan, 300, inplace=True) - - # 1. REORDER CELL TYPES - # a. hierarchical clustering - if row_order == 'hierarchical_clustering': - with plt.style.context('tal_paper'): - f = plt.figure(figsize=(3, 3)) - ax = plt.subplot(1, 1, 1) - dendr = hf.hierarchical_clustering( - centr2, ax, optimal_ordering=True, - labels=centr.index, count_sort=count_sort_row, - method=method_row, metric=metric_row, - ) - plt.tight_layout() - plt.savefig(out_path + f'{save_as}_cell_dendrogram.pdf') - plt.close() - - cluster_order = dendr['ivl'][::-1] - centr = centr.loc[cluster_order] - - # b. manual order - elif isinstance(row_order, list): - centr = centr.loc[row_order] - - # c. descending row mean - elif row_order == 'descending_row_mean': - row_order_idx = np.argsort(centr.mean(axis=1))[::-1] - row_order = centr.index[row_order_idx] - centr = centr.loc[row_order] - - # 1. REORDER GENES - # a. hierarchical clustering - if col_order == 'hierarchical_clustering': - with plt.style.context('tal_paper'): - f = plt.figure(figsize=(3, 3)) - ax = plt.subplot(1, 1, 1) - dendr = hf.hierarchical_clustering( - centr2.T, ax, optimal_ordering=True, - labels=centr.columns, count_sort=count_sort_col, - method=method_col, metric=metric_col, - ) - plt.tight_layout() - plt.savefig(out_path + f'{save_as}_gene_dendrogram.pdf') - plt.close() - - # For gene ordering, put not expressed genes at the end - col_order = dendr['ivl'][::-1] - centr = centr[col_order] - - # b. manual order - elif isinstance(col_order, list): - centr = centr.loc[col_order] - - # c. descending row mean - elif col_order == 'descending_row_mean': - col_order_idx = np.argsort(centr.mean(axis=1))[::-1] - col_order = centr.index[col_order_idx] - centr = centr.loc[col_order] - - return centr - - -# ============================================================================ -# IMPORT DATA - -# Import counts matrix -adata = sc.read_h5ad(hf.path.Crob_adata_file) - -# Set up output folder -out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), - 'go_enrichment_analysis_output') -if not os.path.exists(out_path): os.mkdir(out_path) - -# ============================================================================ -# GO term gene sets with Ciona genes - -for go_gene_sets in ['GO_Molecular_Function_2023', 'GO_Biological_Process_2023', - 'GO_Cellular_Component_2023']: - print(go_gene_sets) - - go_human = gp.get_library(name=go_gene_sets, organism='human') - - # GO terms to look at - go_list = [] - - go_ciona = {} - for term in tqdm(go_human): - go_ciona[term] = [] - this_gene_set = go_human[term] - - # Track how many human genes have orthologs - count_human_genes_with_orthologs = 0 - - for human_g in this_gene_set: - # If there is >1 ortholog... - if human_g in hf.human2ciona: - # ...increment count of human genes with orthologs - count_human_genes_with_orthologs += 1 - - # ...and add each ciona ortholog to the gene set (without repeats) - ciona_g_list = hf.human2ciona[human_g] - for ciona_g in ciona_g_list: - if ciona_g not in go_ciona[term]: - go_ciona[term].append(ciona_g) - - # If >=10 human genes have orthologs - if count_human_genes_with_orthologs >= 10: - # AND if >=10 ciona genes in final list - if len(go_ciona[term]) >= 10: - go_list.append(term) - - - # ============================================================================ - # Differentially expressed genes (Wilcoxon rank-sum test) - - # Wilcoxon rank-sum test - sc.tl.rank_genes_groups(adata, groupby='cell_type') - - # ============================================================================ - # HYPERGEOMETRIC TEST / FISHER'S EXACT TEST - - try: - fisher_results = pd.read_csv(os.path.join(out_path, 'fisher_results.tsv'), - sep='\t') - fisher_results['Cell type'] = fisher_results['Cell type'].astype('category') - fisher_results['GO term'] = fisher_results['GO term'].astype('category') - - except: - - # Cell types to look at - cell_type_list = adata.obs['cell_type'].cat.categories - - # Define the world (all genes which could be in a set) - gene_bool = np.array(np.sum(adata.raw.X != 0, axis=0) > 20).flatten() - # gene_bool = adata.var['highly_variable'] - full_set = adata.var.index[gene_bool] - - # Initialize pandas dataframe to save info - fisher_results = { - 'Cell type': [], - 'GO term': [], - 'Enrichment': [], - 'p-value': [], - 'num_genes_overlap': [], - 'num_genes_in_GO_set': [], - } - - # Loop through every pair of clusters / cell types - count = 0 - for i, cell_type in enumerate(cell_type_list): - print(f'{cell_type} ({i}/{len(cell_type_list)})') - for go_term in tqdm(go_list): - - # Get lists of genes - genes_go = set(go_ciona[go_term]) - - # Get list of genes for this cell type - wilcoxon_df = sc.get.rank_genes_groups_df(adata, group=cell_type) - genes_c = set(wilcoxon_df.loc[ - ( - (wilcoxon_df['logfoldchanges'] > 1) - * (wilcoxon_df['pvals_adj']< 0.01) - ), - 'names' - ].values) - - # Filter so gene sets not in "world" are ignored - genes_go = {g for g in genes_go if g in full_set} - genes_c = {g for g in genes_c if g in full_set} - - # Create contingency table - # Yes Ciona cell type and yes GO term - a = len(genes_c.intersection(genes_go)) - # Yes Ciona cell type and not GO term - b = len(genes_c.difference(genes_go)) - # Not Ciona cell type and yes GO term - c = len(genes_go.difference(genes_c)) - # Not Ciona cell type and not GO term - # a + b + c + d = len(full_set) - d = len(full_set) - a - b - c - - contingency_table = np.array([[a, b], [c, d]]) - - # Perform Fisher's exact test - [enrichment, pval] = fisher_exact(contingency_table, - alternative='greater') - # fisher_results.loc[count, 'Cell type'] = cell_type - # fisher_results.loc[count, 'GO term'] = go_term - # fisher_results.loc[count, 'Enrichment'] = enrichment - # fisher_results.loc[count, 'p-value'] = pval - fisher_results['Cell type'].append(cell_type) - fisher_results['GO term'].append(go_term) - fisher_results['Enrichment'].append(enrichment) - fisher_results['p-value'].append(pval) - fisher_results['num_genes_overlap'].append(a) - fisher_results['num_genes_in_GO_set'].append(a + c) - - count += 1 - - fisher_results = pd.DataFrame(fisher_results) - fisher_results['Cell type'] = fisher_results['Cell type'].astype('category') - fisher_results['GO term'] = fisher_results['GO term'].astype('category') - - # -------------------------------- - # MULTIPLE HYPOTHESIS CORRECTION - - # Run FDR Benjamini-Hochberg correction - [rejected, pval_fdr] = fdrcorrection(fisher_results['p-value'], - alpha=0.05) - - fisher_results['null rejected'] = rejected - fisher_results['FDR'] = pval_fdr - - # Save results - fisher_results[fisher_results['null rejected']].to_csv( - os.path.join(out_path, f'{go_gene_sets}_fisher_results.tsv'), sep='\t' - ) diff --git a/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/go_enrichment_env.txt b/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/go_enrichment_env.txt deleted file mode 100644 index 3bc6e2f..0000000 --- a/Scully_Ciona_blood_2025/scRNA-seq_analysis/go_enrichment_analysis/go_enrichment_env.txt +++ /dev/null @@ -1,72 +0,0 @@ -# This file may be used to create an environment using: -# $ conda create --name --file -# platform: osx-64 -@EXPLICIT -https://repo.anaconda.com/pkgs/main/osx-64/bzip2-1.0.8-h6c40b1e_6.conda -https://repo.anaconda.com/pkgs/main/osx-64/c-ares-1.19.1-h6c40b1e_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/ca-certificates-2024.7.2-hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/libcxx-14.0.6-h9765a3e_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/libev-4.33-h9ed2024_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/libffi-3.4.4-hecd8cb5_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/libiconv-1.16-h6c40b1e_3.conda -https://repo.anaconda.com/pkgs/main/osx-64/ncurses-6.4-hcec6c5f_0.conda -https://conda.anaconda.org/conda-forge/noarch/pybind11-abi-4-hd8ed1ab_3.tar.bz2 -https://repo.anaconda.com/pkgs/main/osx-64/reproc-14.2.4-hcec6c5f_2.conda -https://repo.anaconda.com/pkgs/main/noarch/tzdata-2024a-h04d1e81_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/xz-5.4.6-h6c40b1e_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/zlib-1.2.13-h4b97444_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/fmt-9.1.0-ha357a0b_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/icu-73.1-hcec6c5f_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/libedit-3.1.20230828-h6c40b1e_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/lz4-c-1.9.4-hcec6c5f_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/openssl-3.0.14-h46256e1_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/pcre2-10.42-h9b97e30_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/readline-8.2-hca72f7f_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/reproc-cpp-14.2.4-hcec6c5f_2.conda -https://repo.anaconda.com/pkgs/main/osx-64/tk-8.6.14-h4d00af3_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/yaml-cpp-0.8.0-hcec6c5f_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/krb5-1.20.1-h428f121_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/libnghttp2-1.57.0-h9beae6a_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/libsolv-0.7.24-hfff2838_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/libssh2-1.11.0-hf20ceda_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/libxml2-2.10.4-h45904e2_2.conda -https://repo.anaconda.com/pkgs/main/osx-64/sqlite-3.45.3-h6c40b1e_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/zstd-1.5.5-hc035e20_2.conda -https://repo.anaconda.com/pkgs/main/osx-64/libarchive-3.6.2-h29ab7a1_3.conda -https://repo.anaconda.com/pkgs/main/osx-64/libcurl-8.7.1-hf20ceda_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/python-3.9.19-h5ee71fb_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/libmamba-1.5.8-h258a794_2.conda -https://repo.anaconda.com/pkgs/main/osx-64/menuinst-2.1.2-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/noarch/archspec-0.2.3-pyhd3eb1b0_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/boltons-23.0.0-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/brotli-python-1.0.9-py39hcec6c5f_8.conda -https://repo.anaconda.com/pkgs/main/osx-64/certifi-2024.7.4-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/noarch/charset-normalizer-3.3.2-pyhd3eb1b0_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/distro-1.9.0-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/frozendict-2.4.2-py39h6c40b1e_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/idna-3.7-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/noarch/jsonpointer-2.1-pyhd3eb1b0_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/libmambapy-1.5.8-py39h8c3233a_2.conda -https://repo.anaconda.com/pkgs/main/osx-64/packaging-24.1-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/platformdirs-3.10.0-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/pluggy-1.0.0-py39hecd8cb5_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/pycosat-0.6.6-py39h6c40b1e_1.conda -https://repo.anaconda.com/pkgs/main/noarch/pycparser-2.21-pyhd3eb1b0_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/pysocks-1.7.1-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/python.app-3-py39h9ed2024_0.conda -https://conda.anaconda.org/conda-forge/osx-64/python_abi-3.9-2_cp39.tar.bz2 -https://repo.anaconda.com/pkgs/main/osx-64/setuptools-72.1.0-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/tqdm-4.66.4-py39h20db666_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/wheel-0.43.0-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/cffi-1.16.0-py39h6c40b1e_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/jsonpatch-1.33-py39hecd8cb5_1.conda -https://repo.anaconda.com/pkgs/main/osx-64/pip-24.2-py39hecd8cb5_0.conda -https://conda.anaconda.org/conda-forge/osx-64/ruamel.yaml.clib-0.2.8-py39ha09f3b3_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/urllib3-2.2.2-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/requests-2.32.3-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/ruamel.yaml-0.17.21-py39hca72f7f_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/zstandard-0.22.0-py39h2d76c9a_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/conda-package-streaming-0.10.0-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/conda-package-handling-2.3.0-py39hecd8cb5_0.conda -https://repo.anaconda.com/pkgs/main/noarch/conda-libmamba-solver-24.7.0-pyhd3eb1b0_0.conda -https://repo.anaconda.com/pkgs/main/osx-64/conda-24.7.1-py39hecd8cb5_0.conda diff --git a/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/README.md b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/README.md new file mode 100644 index 0000000..95817c5 --- /dev/null +++ b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/README.md @@ -0,0 +1,19 @@ +The scripts in this folder were used to assess gene coexpression conservation, following methods introduced by [Crow et al. 2022](https://dx.doi.org/10.1093/nar/gkac276). See methods sections "Co-expression Conservation". + +The files are: +- `coexpr_cons_Hs_vs_Cr.py` is the python script that calculates coexpression AUROC scores, as described in Crow et al. 2022. The script outputs a folder `coexpr_cons_Hs_vs_Cr_output/` containing: + - `coexpr_conservation_all.csv`, a CSV containing AUROC scores. Note that the column `raw_auroc` is used in the paper (Supplementary Fig. 9), and that the column `coexpr_conservation` contains a normalized coexpression conservation score described in [Suresh et al. 2023](http://dx.doi.org/10.1038/s41559-023-02186-7) which was not used in our study. See the section "Note on scaled AUROC score" for more detail. + - `network1_gene_list.npy` and `network1_weights.npy`, gene coexpression network connectivity matrices + - PDFs with example AUROC curves for select gene pairs +- `tf_coexpr_Hs_vs_Cr.py` is a python script that looks at coexpression conservation scores for a select set of hematopoiesis-related TFs. The script saves outputs in a folder `tf_coexpr_Hs_vs_Cr_output/` containing graphs in Fig. 6f-h and Supplementary Fig. 9. +- `coexpr_cons_Hs_vs_Dr.py` and `tf_coexpr_Hs_vs_Dr.py` are python scripts which repeat these analyses for the human vs. zebrafish comparison. +- `single_cell_analysis_env.yml` and `single_cell_analysis_env.txt` contain information on the conda environment used, saved to a text file (by running `conda env export --no-builds > single_cell_analysis_env.yml` and `conda list --explicit > single_cell_analysis_env.txt`). The .txt files is for MacOS only. + +Note that the functions for actually calculating coexpression conservation AUROCs are in the helper_functions folder: `Scully_Ciona_blood_2025/helper_functions/gillis_style_coexpression_hf.py`. These functions are imported by the python scripts in this folder. + +## Note on scaled AUROC score +The coexpression conservation score calculation was introduced in a few papers from Jesse Gillis's group. [Crow et al. 2022](https://dx.doi.org/10.1093/nar/gkac276) introduces a coexpression conservation score based on calculation of an AUROC for N-to-M gene homology. [Suresh et al. 2023](http://dx.doi.org/10.1038/s41559-023-02186-7) adds an additional step to get a scaled coexpression conservation score: +1. Calculate AUROC scores for all gene pairs, not just homologous pairs, then +2. Calculate a final coexpression similarity score for each homologous gene pair as ranked with respect to the AUROCs for all possible gene pairs. Normalize to 1 to get the final score. + +In our paper, we only use the AUROC score as introduced by Crow et al., but the code supports an approximation of the Suresh et al. ranking score. To improve script run times, we approximate the Suresh et al. ranking score by calculating AUROCs for a large number of non-homologous gene pairs (rather than for all possible pairs). The function `calculate_coexpression_conservation_complex_orthology` takes an input `num_random_pairs`, which sets the number of non-homologous gene pairs to look at. If this parameter is not specified, the functions default to setting `num_random_pairs` to be 100 times the number of homologous gene pairs, such that homologous gene pairs make up only 1% of the final AUROC distribution. diff --git a/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/coexpr_cons_Hs_vs_Cr.py b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/coexpr_cons_Hs_vs_Cr.py new file mode 100644 index 0000000..559d39d --- /dev/null +++ b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/coexpr_cons_Hs_vs_Cr.py @@ -0,0 +1,142 @@ +import os, sys +import numpy as np +import pandas as pd +import scanpy as sc +import matplotlib.pyplot as plt + +# Change this path to point to folder containing helper functions scripts +path_to_repo_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(os.path.join(path_to_repo_dir, 'helper_functions')) +import scrna_helper_functions as hf +import gillis_style_coexpression_hf as coexpr + +# ============================================================================ +# OUTPUT PATH + +out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'coexpr_cons_Hs_vs_Cr_output') +if not os.path.exists(out_path): os.mkdir(out_path) + +# ============================================================================ +# THESE SPECIES + +species1 = hf.Species('Ciona', 'robusta') +species2 = hf.Species('Homo', 'sapiens') + +# ------------------------------------ +# Ciona dataset + +print('Importing C. robusta dataset...') +adata1 = sc.read_h5ad(hf.path.Crob_adata_file) + +# ------------------------------------ +# Human dataset + +print('Importing human dataset...') +adata2 = sc.read_h5ad(os.path.join(hf.path.external_datasets, + 'Hs_Tabula_Sapiens_blood_bone_marrow.h5ad')) + +# Save a copy of raw (unnormalized data) +adata2.layers['raw_unnorm_expression'] = adata2.X + +# Total count normalization +# (without normalizing layers['raw_unnorm_expression']) +norm = sc.pp.normalize_total(adata2, exclude_highly_expressed=False, + inplace=False, target_sum=10**4) +adata2.X = norm['X'] +del norm + +# Identify highly variable genes using Klein et al. 2015 method +# (assumes normalized counts but NOT log transformed) +hvg_list = hf.find_hvgs(adata2, min_cells=5) +plt.title(f'{len(hvg_list)} highly variable genes') +plt.close() +adata2.var['highly_variable'] = False +for g_idx in hvg_list: + g = adata2.var.index[g_idx] + adata2.var.loc[g, 'highly_variable'] = True +print(f'highly variable genes: {len(hvg_list)} passed') + +# Logarithmize +sc.pp.log1p(adata2, base=10) + +# Before filtering, set `adata2.raw` to normalized & logarithmized data +adata2.raw = adata2 + +# Scale data +sc.pp.scale(adata2) + +# Run PCA on z-scored data +sc.tl.pca(adata2, use_highly_variable=True, n_comps=50) + +# Identify k nearest neighbors (Euclidean distance in PCA space) +sc.pp.neighbors(adata2, n_neighbors=15, n_pcs=50, use_rep='X_pca') + +# ============================================================================ +# Get list of genes for coexpression network + +# Only consider genes detected in >20 cells +gene_bool1 = np.array([np.sum(adata1.raw.X != 0, axis=0) > 20]).flatten() +gene_bool2 = np.array([np.sum(adata2.raw.X != 0, axis=0) > 20]).flatten() +gene_list1 = adata1.var.index[gene_bool1] +gene_list2 = adata2.var.index[gene_bool2] + +# Filter to known orthology gene lists (based on OrthoFinder) +gene_list1 = np.array([g for g in gene_list1 if g in hf.ciona2human]) +gene_list2 = np.array([g for g in gene_list2 if g in hf.human2ciona]) + +# ============================================================================ +# Get co-expression ranked networks + +# Generate gene coexpression ranked networks +print('Coexpression ranked networks') +print(' species 1') +rank1 = coexpr.coexpr_network(adata1, gene_list1) +print(' species 2') +rank2 = coexpr.coexpr_network(adata2, gene_list2) + +# Save networks +np.save(os.path.join(out_path, 'network1_weights'), rank1) +np.save(os.path.join(out_path, 'network2_weights'), rank2) +np.save(os.path.join(out_path, 'network1_gene_list'), gene_list1) +np.save(os.path.join(out_path, 'network2_gene_list'), gene_list2) + +# Coexpression conservation scores +print('Coexpression conservation scores') +example_gene_to_plot = [ + ('KY21.Chr12.541', 'MAFB'), + ('KY21.Chr6.542', 'SPI1'), + ('KY21.Chr3.726', 'CEBPA'), + ('KY21.Chr7.830', 'SOX18'), + ('KY21.Chr2.1280', 'MKI67'), + ('KY21.Chr3.661', 'MMP2'), + ('KY21.Chr3.661', 'MMP9'), +] +results = coexpr.calculate_coexpression_conservation_complex_orthology( + [rank1, rank2], + gene_list1, gene_list2, + orthology_1to2=hf.ciona2human, + orthology_2to1=hf.human2ciona, + n=20, + example_gene_to_plot=example_gene_to_plot, + save_as=os.path.join(out_path, 'example_'), + num_random_pairs=1, # Set to a large number for Suresh et al.-like score + # (see README.md in this directory for explanation) +) + +orthologs_coexpr = pd.DataFrame(index=results['gene_pairs']) +orthologs_coexpr[f'{species1.Xx}_gene'] = [ + gene_list1[pair[0]] for pair in results['gene_pairs'] +] +orthologs_coexpr[f'{species2.Xx}_gene'] = [ + gene_list2[pair[1]] for pair in results['gene_pairs'] +] +orthologs_coexpr['coexpr_conservation'] = results['coexpr_conservation'] +orthologs_coexpr['raw_auroc'] = results['auroc_orthologs'] + +# Save scores +filename = 'coexpr_conservation_all' +orthologs_coexpr.to_csv(os.path.join(out_path, filename + '.csv')) + +# Save non-orthologous genes distribution of AUROCs +np.save(os.path.join(out_path, 'non_orthologs_aurocs'), results['auroc_non_orthologs']) diff --git a/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/coexpr_cons_Hs_vs_Dr.py b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/coexpr_cons_Hs_vs_Dr.py new file mode 100644 index 0000000..4c2f3f6 --- /dev/null +++ b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/coexpr_cons_Hs_vs_Dr.py @@ -0,0 +1,174 @@ +import os, sys +import pickle +import numpy as np +import pandas as pd +import scanpy as sc +import matplotlib.pyplot as plt + +# Change this path to point to folder containing helper functions scripts +path_to_repo_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(os.path.join(path_to_repo_dir, 'helper_functions')) +import scrna_helper_functions as hf +import gillis_style_coexpression_hf as coexpr + +# ============================================================================ +# OUTPUT PATH + +out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'coexpr_cons_Hs_vs_Dr_output') +if not os.path.exists(out_path): os.mkdir(out_path) + +# ============================================================================ +# THESE SPECIES + +species1 = hf.Species('Danio', 'rerio') +species2 = hf.Species('Homo', 'sapiens') + +# ------------------------------------ +# Zfish dataset + +print('Importing zebrafish dataset...') +adata1 = sc.read_h5ad(os.path.join(hf.path.external_datasets, + 'Dr_zfish_kidney_marrow_Wang.h5ad')) + +# Save a copy of raw (unnormalized data) +adata1.layers['raw_unnorm_expression'] = adata1.X + +# Total count normalization +# (without normalizing layers['raw_unnorm_expression']) +norm = sc.pp.normalize_total(adata1, exclude_highly_expressed=False, + inplace=False, target_sum=10**4) +adata1.X = norm['X'] +del norm + +# Logarithmize +sc.pp.log1p(adata1, base=10) + +# Set `adata.raw` to normalized & logarithmized data +adata1.raw = adata1 + +# PCA and nearest neighbor graph +sc.pp.pca(adata1) +sc.pp.neighbors(adata1) + +# ------------------------------------ +# Human dataset + +print('Importing human dataset...') +adata2 = sc.read_h5ad(os.path.join(hf.path.external_datasets, + 'Hs_Tabula_Sapiens_blood_bone_marrow.h5ad')) + +# Save a copy of raw (unnormalized data) +adata2.layers['raw_unnorm_expression'] = adata2.X + +# Total count normalization +# (without normalizing layers['raw_unnorm_expression']) +norm = sc.pp.normalize_total(adata2, exclude_highly_expressed=False, + inplace=False, target_sum=10**4) +adata2.X = norm['X'] +del norm + +# Identify highly variable genes using Klein et al. 2015 method +# (assumes normalized counts but NOT log transformed) +hvg_list = hf.find_hvgs(adata2, min_cells=5) +plt.title(f'{len(hvg_list)} highly variable genes') +plt.close() +adata2.var['highly_variable'] = False +for g_idx in hvg_list: + g = adata2.var.index[g_idx] + adata2.var.loc[g, 'highly_variable'] = True +print(f'highly variable genes: {len(hvg_list)} passed') + +# Logarithmize +sc.pp.log1p(adata2, base=10) + +# Before filtering, set `adata2.raw` to normalized & logarithmized data +adata2.raw = adata2 + +# Scale data +sc.pp.scale(adata2) + +# Run PCA on z-scored data +sc.tl.pca(adata2, use_highly_variable=True, n_comps=50) + +# Identify k nearest neighbors (Euclidean distance in PCA space) +sc.pp.neighbors(adata2, n_neighbors=15, n_pcs=50, use_rep='X_pca') + +# ============================================================================ +# Orthology dict + +print('\nGetting OrthoFinder orthologs for all gene pairs...') +path_to_dict = os.path.join(path_to_repo_dir, 'helper_functions', 'gene_dictionaries') +with open(os.path.join(path_to_dict, 'hsap2drer.pickle'), 'rb') as fp: + hs2dr = pickle.load(fp) + dr2hs = pickle.load(fp) + +# ============================================================================ +# Get list of genes for coexpression network + +# Only consider genes detected in >20 cells +gene_bool1 = np.array([np.sum(adata1.raw.X != 0, axis=0) > 20]).flatten() +gene_bool2 = np.array([np.sum(adata2.raw.X != 0, axis=0) > 20]).flatten() +gene_list1 = adata1.var.index[gene_bool1] +gene_list2 = adata2.var.index[gene_bool2] + +# Filter to known orthology gene lists (based on OrthoFinder) +gene_list1 = np.array([g for g in gene_list1 if g in dr2hs]) +gene_list2 = np.array([g for g in gene_list2 if g in hs2dr]) + +# ============================================================================ +# Get co-expression ranked networks + +# Generate gene coexpression ranked networks +print('Coexpression ranked networks') +print(' species 1') +rank1 = coexpr.coexpr_network(adata1, gene_list1) +print(' species 2') +rank2 = coexpr.coexpr_network(adata2, gene_list2) + +# Save networks +np.save(os.path.join(out_path, 'network1_weights'), rank1) +np.save(os.path.join(out_path, 'network2_weights'), rank2) +np.save(os.path.join(out_path, 'network1_gene_list'), gene_list1) +np.save(os.path.join(out_path, 'network2_gene_list'), gene_list2) + +# Coexpression conservation scores +print('Coexpression conservation scores') +example_gene_to_plot = [ + ('gfi1aa', 'GFI1'), + ('gfi1ab', 'GFI1'), + ('cebpa', 'CEBPA'), + ('spi1', 'SPI1'), + ('stat5a', 'STAT5A'), + ('stat5b', 'STAT5A'), + ('mki67', 'MKI67'), +] +results = coexpr.calculate_coexpression_conservation_complex_orthology( + [rank1, rank2], + gene_list1, gene_list2, + orthology_1to2=dr2hs, + orthology_2to1=hs2dr, + n=20, + example_gene_to_plot=example_gene_to_plot, + save_as=os.path.join(out_path, 'example_'), + num_random_pairs=1, # Set to a large number for Suresh et al.-like score + # (see README.md in this directory for explanation) + orthology_dict=dr2hs, +) + +orthologs_coexpr = pd.DataFrame(index=results['gene_pairs']) +orthologs_coexpr[f'{species1.Xx}_gene'] = [ + gene_list1[pair[0]] for pair in results['gene_pairs'] +] +orthologs_coexpr[f'{species2.Xx}_gene'] = [ + gene_list2[pair[1]] for pair in results['gene_pairs'] +] +orthologs_coexpr['coexpr_conservation'] = results['coexpr_conservation'] +orthologs_coexpr['raw_auroc'] = results['auroc_orthologs'] + +# Save scores +filename = 'coexpr_conservation_all' +orthologs_coexpr.to_csv(os.path.join(out_path, filename + '.csv')) + +# Save non-orthologous genes distribution of AUROCs +np.save(os.path.join(out_path, 'non_orthologs_aurocs'), results['auroc_non_orthologs']) diff --git a/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/single_cell_analysis_env.txt b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/single_cell_analysis_env.txt new file mode 100644 index 0000000..da1c766 --- /dev/null +++ b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/single_cell_analysis_env.txt @@ -0,0 +1,39 @@ +# This file may be used to create an environment using: +# $ conda create --name --file +# platform: osx-64 +@EXPLICIT +https://repo.anaconda.com/pkgs/main/osx-64/ca-certificates-2021.7.5-hecd8cb5_1.conda +https://repo.anaconda.com/pkgs/main/osx-64/libcxx-10.0.0-1.conda +https://repo.anaconda.com/pkgs/main/osx-64/libsodium-1.0.18-h1de35cc_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/xz-5.2.5-h1de35cc_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/zlib-1.2.11-h1de35cc_3.conda +https://repo.anaconda.com/pkgs/main/osx-64/libffi-3.3-hb1e8313_2.conda +https://repo.anaconda.com/pkgs/main/osx-64/ncurses-6.2-h0a44026_1.conda +https://repo.anaconda.com/pkgs/main/osx-64/openssl-1.1.1k-h9ed2024_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/tk-8.6.10-hb0a8c7a_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/zeromq-4.3.4-h23ab428_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/readline-8.1-h9ed2024_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/sqlite-3.36.0-hce871da_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/python-3.8.10-h88f2d9e_7.conda +https://repo.anaconda.com/pkgs/main/osx-64/appnope-0.1.2-py38hecd8cb5_1001.conda +https://repo.anaconda.com/pkgs/main/noarch/backcall-0.2.0-pyhd3eb1b0_0.tar.bz2 +https://repo.anaconda.com/pkgs/main/osx-64/certifi-2021.5.30-py38hecd8cb5_0.conda +https://repo.anaconda.com/pkgs/main/noarch/ipython_genutils-0.2.0-pyhd3eb1b0_1.conda +https://repo.anaconda.com/pkgs/main/noarch/parso-0.8.2-pyhd3eb1b0_0.conda +https://repo.anaconda.com/pkgs/main/noarch/pickleshare-0.7.5-pyhd3eb1b0_1003.conda +https://repo.anaconda.com/pkgs/main/noarch/ptyprocess-0.7.0-pyhd3eb1b0_2.conda +https://repo.anaconda.com/pkgs/main/osx-64/pyzmq-20.0.0-py38h23ab428_1.conda +https://repo.anaconda.com/pkgs/main/noarch/six-1.16.0-pyhd3eb1b0_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/tornado-6.1-py38h9ed2024_0.conda +https://repo.anaconda.com/pkgs/main/noarch/wcwidth-0.2.5-py_0.conda +https://repo.anaconda.com/pkgs/main/noarch/wheel-0.36.2-pyhd3eb1b0_0.conda +https://repo.anaconda.com/pkgs/main/noarch/pexpect-4.8.0-pyhd3eb1b0_3.conda +https://repo.anaconda.com/pkgs/main/noarch/python-dateutil-2.8.2-pyhd3eb1b0_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/setuptools-52.0.0-py38hecd8cb5_0.conda +https://repo.anaconda.com/pkgs/main/noarch/traitlets-5.0.5-pyhd3eb1b0_0.tar.bz2 +https://repo.anaconda.com/pkgs/main/osx-64/jupyter_core-4.7.1-py38hecd8cb5_0.conda +https://repo.anaconda.com/pkgs/main/noarch/matplotlib-inline-0.1.2-pyhd3eb1b0_2.conda +https://repo.anaconda.com/pkgs/main/osx-64/pip-21.1.3-py38hecd8cb5_0.conda +https://repo.anaconda.com/pkgs/main/noarch/pygments-2.9.0-pyhd3eb1b0_0.conda +https://repo.anaconda.com/pkgs/main/noarch/jupyter_client-6.1.12-pyhd3eb1b0_0.conda +https://repo.anaconda.com/pkgs/main/osx-64/ipykernel-5.3.4-py38h5ca1d4c_0.conda diff --git a/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/single_cell_analysis_env.yml b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/single_cell_analysis_env.yml new file mode 100644 index 0000000..d23db90 --- /dev/null +++ b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/single_cell_analysis_env.yml @@ -0,0 +1,157 @@ +name: sc_env +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - appnope=0.1.2 + - backcall=0.2.0 + - ca-certificates=2021.7.5 + - certifi=2021.5.30 + - ipykernel=5.3.4 + - ipython_genutils=0.2.0 + - jupyter_client=6.1.12 + - jupyter_core=4.7.1 + - libcxx=10.0.0 + - libffi=3.3 + - libsodium=1.0.18 + - matplotlib-inline=0.1.2 + - ncurses=6.2 + - openssl=1.1.1k + - parso=0.8.2 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pip=21.1.3 + - ptyprocess=0.7.0 + - pygments=2.9.0 + - python=3.8.10 + - python-dateutil=2.8.2 + - pyzmq=20.0.0 + - readline=8.1 + - setuptools=52.0.0 + - six=1.16.0 + - sqlite=3.36.0 + - tk=8.6.10 + - tornado=6.1 + - traitlets=5.0.5 + - wcwidth=0.2.5 + - wheel=0.36.2 + - xz=5.2.5 + - zeromq=4.3.4 + - zlib=1.2.11 + - pip: + - anndata==0.7.6 + - annoy==1.17.0 + - anyio==3.4.0 + - argon2-cffi==21.1.0 + - astroid==2.13.3 + - attrs==21.2.0 + - babel==2.9.1 + - bbknn==1.6.0 + - bleach==4.1.0 + - blosc2==2.0.0 + - cffi==1.15.0 + - charset-normalizer==2.0.7 + - cycler==0.10.0 + - cython==0.29.24 + - decorator==4.4.2 + - defusedxml==0.7.1 + - dill==0.3.6 + - entrypoints==0.3 + - et-xmlfile==1.1.0 + - fastcluster==1.2.6 + - fbpca==1.0 + - geosketch==1.2 + - h5py==3.3.0 + - harmonypy==0.0.9 + - idna==3.3 + - imageio==2.9.0 + - importlib-metadata==8.4.0 + - importlib-resources==5.4.0 + - intervaltree==3.1.0 + - ipython==7.25.0 + - isort==5.11.4 + - jedi==0.18.0 + - jinja2==3.0.3 + - joblib==1.2.0 + - json5==0.9.6 + - jsonschema==4.2.1 + - jupyter-server==1.12.0 + - jupyterlab==3.2.4 + - jupyterlab-pygments==0.1.2 + - jupyterlab-server==2.8.2 + - kiwisolver==1.3.1 + - lazy-object-proxy==1.9.0 + - leidenalg==0.8.7 + - llvmlite==0.41.1 + - louvain==0.7.0 + - lxml==4.8.0 + - markupsafe==2.0.1 + - matplotlib==3.4.2 + - matplotlib-venn==0.11.9 + - mccabe==0.7.0 + - mistune==0.8.4 + - msgpack==1.0.8 + - natsort==7.1.1 + - nbclassic==0.3.4 + - nbclient==0.5.9 + - nbconvert==6.3.0 + - nbformat==5.1.3 + - nest-asyncio==1.5.1 + - networkx==2.5.1 + - notebook==6.4.6 + - numba==0.58.1 + - numexpr==2.7.3 + - numpy==1.24.4 + - openpyxl==3.1.2 + - packaging==21.0 + - pandas==1.3.0 + - pandocfilters==1.5.0 + - patsy==0.5.6 + - pillow==8.3.1 + - platformdirs==2.6.2 + - prometheus-client==0.12.0 + - prompt-toolkit==3.0.19 + - py-cpuinfo==9.0.0 + - pycparser==2.21 + - pydot==3.0.2 + - pylint==2.15.10 + - pynndescent==0.5.4 + - pyparsing==3.1.4 + - pyrsistent==0.18.0 + - python-igraph==0.9.6 + - python-louvain==0.16 + - pytz==2021.1 + - pywavelets==1.1.1 + - requests==2.26.0 + - scanorama==1.7.3 + - scanpy==1.8.1 + - scikit-image==0.18.2 + - scikit-learn==1.2.1 + - scipy==1.10.1 + - scrublet==0.2.3 + - seaborn==0.11.1 + - send2trash==1.8.0 + - sinfo==0.3.4 + - sniffio==1.2.0 + - sortedcontainers==2.4.0 + - statsmodels==0.13.1 + - stdlib-list==0.8.0 + - tables==3.8.0 + - terminado==0.12.1 + - testpath==0.5.0 + - texttable==1.6.4 + - threadpoolctl==2.2.0 + - tifffile==2021.8.8 + - tomli==2.0.1 + - tomlkit==0.11.6 + - tqdm==4.61.2 + - typing-extensions==4.4.0 + - umap-learn==0.5.1 + - urllib3==1.26.7 + - vireosnp==0.5.8 + - webencodings==0.5.1 + - websocket-client==1.2.1 + - wrapt==1.14.1 + - xlrd==1.2.0 + - zipp==3.6.0 diff --git a/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/tf_coexpr_Hs_vs_Cr.py b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/tf_coexpr_Hs_vs_Cr.py new file mode 100644 index 0000000..0996b01 --- /dev/null +++ b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/tf_coexpr_Hs_vs_Cr.py @@ -0,0 +1,364 @@ +import os, sys +import numpy as np +import pandas as pd +import scanpy as sc +import matplotlib.pyplot as plt +import seaborn as sns +from scipy.stats import pearsonr +from statsmodels.stats.multitest import fdrcorrection + +# Change this path to point to folder containing gene_hf.py +# This imports dictionaries and functions for easily converting gene ids +path_to_repo_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(os.path.join(path_to_repo_dir, 'helper_functions')) +import scrna_helper_functions as hf +import gillis_style_coexpression_hf as coexpr + +# Plot style +plt.style.use('tal_paper') + +# ============================================================================ +# OUTPUT PATH + +out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'tf_coexpr_Hs_vs_Cr_output') +if not os.path.exists(out_path): os.mkdir(out_path) + +# ============================================================================ +# THESE SPECIES + +species1 = hf.Species('Ciona', 'robusta') +species2 = hf.Species('Homo', 'sapiens') + +# ------------------------------------ +# Ciona dataset + +print('Importing C. robusta dataset...') +adata_cr = sc.read_h5ad(hf.path.Crob_adata_file) + +# ------------------------------------ +# Human dataset + +print('Importing human dataset...') +adata_hs = sc.read_h5ad(os.path.join(hf.path.external_datasets, + 'Hs_Tabula_Sapiens_blood_bone_marrow.h5ad')) + +# Save a copy of raw (unnormalized data) +adata_hs.layers['raw_unnorm_expression'] = adata_hs.X + +# Total count normalization +# (without normalizing layers['raw_unnorm_expression']) +norm = sc.pp.normalize_total(adata_hs, exclude_highly_expressed=False, + inplace=False, target_sum=10**4) +adata_hs.X = norm['X'] +del norm + +# Identify highly variable genes using Klein et al. 2015 method +# (assumes normalized counts but NOT log transformed) +hvg_list = hf.find_hvgs(adata_hs, min_cells=5) +plt.title(f'{len(hvg_list)} highly variable genes') +plt.close() +adata_hs.var['highly_variable'] = False +for g_idx in hvg_list: + g = adata_hs.var.index[g_idx] + adata_hs.var.loc[g, 'highly_variable'] = True +print(f'highly variable genes: {len(hvg_list)} passed') + +# Logarithmize +sc.pp.log1p(adata_hs, base=10) + +# Before filtering, set `adata_hs.raw` to normalized & logarithmized data +adata_hs.raw = adata_hs + +# Scale data +sc.pp.scale(adata_hs) + +# Run PCA on z-scored data +sc.tl.pca(adata_hs, use_highly_variable=True, n_comps=50) + +# Identify k nearest neighbors (Euclidean distance in PCA space) +sc.pp.neighbors(adata_hs, n_neighbors=15, n_pcs=50, use_rep='X_pca') + +# ------------------------------------ +# Coexpression conservation + +print('Importing coexpression conservation scores...') +path_to_coexpr_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'coexpr_cons_Hs_vs_Cr_output') +coexpr_conservation = pd.read_csv( + os.path.join(path_to_coexpr_data, 'coexpr_conservation_all.csv'), + index_col=0) + +# ============================================================================ +# Heatmap plotting function +# (Copied and adjusted from SAMap heatmap plotting function) + +def plot_together(mtx_hs, mtx_cr, save_as='heatmaps', cmap=None): + if cmap is None: + cmap = sns.diverging_palette(230, 10, s=100, l=35, as_cmap=True) + + # Set figure size and aspect ratios + num_human_genes = mtx_hs.shape[1] + num_ciona_genes = mtx_cr.shape[1] + total_gene_rows = max(num_human_genes, num_ciona_genes) + plot_width = 6.5 + plot_height_adjust = 1.4335 + f, axs = plt.subplots( + 1, 2, + figsize=(plot_width, 0.08618*total_gene_rows + plot_height_adjust), + gridspec_kw={'width_ratios': [mtx_hs.shape[0], mtx_cr.shape[0]]} + ) + f_cbar, ax_cbar = plt.subplots(1, 1, figsize=(0.75, 1.2)) + yticks = [[], []] + + # Get cbar range for all heatmaps + vmax = max(mtx_hs.max().max(), mtx_cr.max().max()) + vmin = min(mtx_hs.min().min(), mtx_cr.min().min()) + + # Plot human expression + sns.heatmap( + mtx_hs.T, + cmap=cmap, + center=0, vmax=vmax, vmin=vmin, + xticklabels=True, yticklabels=False, + ax=axs[0], cbar=False, + ) + yticks[0] = mtx_hs.columns + + # Plot ciona expression + sns.heatmap( + mtx_cr.T, + cmap=cmap, + center=0, vmax=vmax, vmin=vmin, + xticklabels=True, yticklabels=False, + ax=axs[1], + cbar_kws={'label': 'Expression z-score'}, cbar_ax=ax_cbar + ) + yticks[1] = mtx_cr.columns + + # Set gene labels after tight_layout() to get consistent heatmap widths + f.tight_layout() + for i, mtx in enumerate([mtx_hs, mtx_cr]): + axs[i].set_yticks(np.arange(len(yticks[i])) + 0.5) + axs[i].set_yticklabels(yticks[i]) + f.subplots_adjust(left=0.12, wspace=0.6) + + f.savefig(os.path.join(out_path, f'{save_as}.pdf')) + plt.close() + + f_cbar.tight_layout() + f_cbar.savefig(os.path.join(out_path, f'{save_as}_cbar.pdf')) + plt.close() + +# ============================================================================ +# Get cluster centroids + +centr_hs = hf.cluster_centroids(adata_hs, groupby='cell_type_coarse') +centr_cr = hf.cluster_centroids(adata_cr, groupby='cell_type') + +# Adjust human cell type names +human_cell_types = [x.replace('-positive', '+').replace(',', '')\ + .replace('Kuppfer', 'Kupffer cell') + for x in centr_hs.index] +centr_hs.index = human_cell_types + +centr_hs_norm = centr_hs / centr_hs.max() +centr_cr_norm = centr_cr / centr_cr.max() +centr_z_hs = (centr_hs - centr_hs.mean()) / centr_hs.std() +centr_z_cr = (centr_cr - centr_cr.mean()) / centr_cr.std() + +# ============================================================================ + +# Number of top coexpression partners in human to look at +n=30 + +# Import coexpression networks themselves +network1 = np.load(os.path.join(path_to_coexpr_data, 'network1_weights.npy')) +network2 = np.load(os.path.join(path_to_coexpr_data, 'network2_weights.npy')) +gene_list1 = np.load(os.path.join(path_to_coexpr_data, 'network1_gene_list.npy')) +gene_list2 = np.load(os.path.join(path_to_coexpr_data, 'network2_gene_list.npy')) +network1 = pd.DataFrame(network1, index=gene_list1, columns=gene_list1) +network2 = pd.DataFrame(network2, index=gene_list2, columns=gene_list2) + +top_n_genes1 = coexpr.get_top_n(np.array(network1), n=n) +top_n_genes2 = coexpr.get_top_n(np.array(network2), n=n) +top_n_genes1 = pd.DataFrame(top_n_genes1, index=gene_list1, columns=gene_list1) +top_n_genes2 = pd.DataFrame(top_n_genes2, index=gene_list2, columns=gene_list2) + +del gene_list1, gene_list2 + +# ------------------------------------ +# Coexpression conservation example + +goi_list = [] + +# Genes involved in human/vertebrate hematopoiesis +hs_gene_list = [ + # High coexpression conservation + 'E2F8', + + # Genes related to hematopoiesis + 'SPI1', + 'CEBPA', + 'CEBPB', + 'CEBPE', + 'IRF4', + 'IRF8', + 'JUN', + 'LEF1', + 'GFI1', + 'GATA2', + 'MITF', + 'STAT5A', + 'STAT5B', + # 'KLF4', # Low expression + 'RUNX1', + 'TAL1', + 'NFKB1', + # 'STAT3', # no Ciona ortholog + 'EGR1', + 'EGR2', + 'EGR3', + 'MAFB', + 'MAF', +] + +# For each human TF get homolog in Ciona +for g_hs in hs_gene_list: + g_cr_list = hf.human2ciona[g_hs] + for g_cr in g_cr_list: + if g_cr in hf.tf_dict: g_cr_print = hf.tf_dict[g_cr] + else: g_cr_print = g_cr + if g_cr in network1.index: + row = [g_hs, g_cr, g_cr_print] + if row not in goi_list: + goi_list.append(row) + +out_path2 = os.path.join(out_path, 'example_coepxression') +if not os.path.exists(out_path2): os.mkdir(out_path2) + +# Loop through TFs to make heatmap plots +score_per_tf = {} +for goi_hs, goi_cr, goi_cr_print in goi_list: + + # Get top n genes in human, sort by decreasing correlation rank + top_coexpr_partners2 = top_n_genes2.index[top_n_genes2.loc[goi_hs, :]] + + # Get orthologs of these coexpression partner genes in Ciona + orthologs_sp1 = [goi_cr] + for g in list(top_coexpr_partners2):#[goi_hs] + list(top_coexpr_partners2): + if g in hf.human2ciona: + for g_cr in hf.human2ciona[g]: + if g_cr not in orthologs_sp1: + orthologs_sp1.append(g_cr) + + # Ciona matrix to plot + mtx_cr = centr_z_cr[orthologs_sp1] + # Remove NaNs + mtx_cr = mtx_cr.loc[:, mtx_cr.isnull().sum() == 0] + # Order of genes + gene_order_cr = np.argsort(mtx_cr.corrwith(mtx_cr.iloc[:, 0]))[::-1] + mtx_cr = mtx_cr.iloc[:, gene_order_cr] + # Order of cell types + mtx_cr = mtx_cr.sort_values(by=mtx_cr.columns[0])[::-1] + # Gene labeling + gene_labels = [] + for g_cr in mtx_cr.columns: + this_str = g_cr.replace('KY21.Chr', 'C')\ + .replace('KY21.UAContig', 'UAC') + ' (' + g_hs_list = [g_hs for g_hs in hf.ciona2human[g_cr] + if (g_hs in top_coexpr_partners2) or (g_hs == goi_hs)] + this_str += '/'.join(g_hs_list) + ')' + gene_labels.append(this_str) + mtx_cr.columns = gene_labels + + # Human matrix to plot + mtx_hs = centr_z_hs[[goi_hs] + list(top_coexpr_partners2)] + # Order of genes + gene_ordered_list_hs = [] + for g_cr in mtx_cr.columns: + g_hs_str = g_cr.split('(')[1][:-1] + g_hs_list = g_hs_str.split('/') + for g_hs in g_hs_list: + if g_hs not in gene_ordered_list_hs: + gene_ordered_list_hs.append(g_hs) + mtx_hs = mtx_hs.loc[:, gene_ordered_list_hs] + # Order of cell types + mtx_hs = mtx_hs.sort_values(by=goi_hs)[::-1] + + if goi_hs in hs_gene_list: + plot_together(mtx_hs, mtx_cr, + save_as=os.path.join('example_coepxression', + f'{goi_hs}_{goi_cr}'), + cmap='seismic') + + # -------------------------------- + # Convert to score + + # Get correlation within Ciona genes + corr_values = [] + for g in mtx_cr.columns[1:]: + corr, pval = pearsonr(mtx_cr.iloc[:, 0], mtx_cr[g]) + corr_values.append([g, corr, pval]) + if pval > 0.05: break + corr_values = pd.DataFrame(corr_values, columns=['gene', 'corr', 'pval']) + passed, fdr = fdrcorrection(corr_values['pval'], alpha=0.05) + corr_values['fdr'] = fdr + + # Get genes above threshold + threshold_param = 'corr' + threshold = 0.5 + corr_values = corr_values[corr_values[threshold_param] > threshold] + + # Save score + score_per_tf[f'{goi_hs} vs. {goi_cr_print}'] = len(corr_values) + +# ------------------------------------ +# Summary plots + +# Bar graph summarizing values for several TFs +tf_list = np.array([tf for tf in score_per_tf]) +tf_scores = np.array([score_per_tf[tf] for tf in score_per_tf]) +reorder = np.argsort(tf_scores) +f = plt.figure(figsize=(1.75, 3)) +plt.barh(tf_list[reorder[:-1]], tf_scores[reorder[:-1]], color='#cc9200', zorder=3) +plt.barh(tf_list[reorder[-1]], tf_scores[reorder[-1]], color='#888888', zorder=3) +plt.ylabel('Cross-species TF pair') +plt.xlabel('Number of\nshared coexpr.\npartners') +plt.xlim(0, 30.5) +plt.ylim(-0.75, len(tf_list)-0.25) +plt.xticks(ticks=[0, 10, 20, 30], labels=[0, 10, 20, 30], fontsize=6.5) +plt.yticks(fontsize=6.5) +plt.grid(color='#d0d0d0', which='major', axis='x', linestyle='-', + linewidth=0.75, zorder=-1) +plt.tight_layout() +plt.savefig(os.path.join(out_path, 'summary_bar_graph.pdf')) +plt.close() + +# Bar graph with AUROCs instead +tf_list = [] +tf_scores = [] +for tf_hs, tf_cr, tf_cr_print in goi_list: + foo = coexpr_conservation.loc[(coexpr_conservation['Cr_gene'] == tf_cr) + * (coexpr_conservation['Hs_gene'] == tf_hs)] + if len(foo) > 0: + tf_scores.append(foo['raw_auroc'].values[0]) + tf_list.append(f'{tf_hs} vs. {tf_cr_print}') +tf_list = np.array(tf_list) +tf_scores = np.array(tf_scores) +reorder = np.argsort(tf_scores)#[::-1] +f = plt.figure(figsize=(1.75, 3)) +plt.barh(tf_list[reorder[:-1]], tf_scores[reorder[:-1]], color='#cc9200', zorder=3) +plt.barh(tf_list[reorder[-1]], tf_scores[reorder[-1]], color='#888888', zorder=3) +plt.axvline(x=0.5, linestyle='--', linewidth=0.75, color='k', zorder=4) +plt.ylabel('Cross-species TF pair') +plt.xlabel('Coexpression\nconservation\nscore (AUROC)') +plt.xlim(0, 1.01) +plt.ylim(-1, len(tf_list)) +plt.xticks(ticks=[0, 0.5, 1], labels=[0, 0.5, 1], fontsize=6.5) +plt.yticks(fontsize=6.5)#, rotation=90) +plt.grid(color='#d0d0d0', which='major', axis='x', linestyle='-', + linewidth=0.75, zorder=-1) +plt.tight_layout() +plt.savefig(os.path.join(out_path, 'summary_bar_graph_AUROCs.pdf')) +plt.close() diff --git a/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/tf_coexpr_Hs_vs_Dr.py b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/tf_coexpr_Hs_vs_Dr.py new file mode 100644 index 0000000..e8ed1af --- /dev/null +++ b/Scully_Ciona_blood_2025/species_comparisons/coexpression_conservation/tf_coexpr_Hs_vs_Dr.py @@ -0,0 +1,390 @@ +import os, sys +import numpy as np +import pandas as pd +import scanpy as sc +import matplotlib.pyplot as plt +import seaborn as sns +import pickle +from scipy.stats import pearsonr +from statsmodels.stats.multitest import fdrcorrection + +# Change this path to point to folder containing gene_hf.py +# This imports dictionaries and functions for easily converting gene ids +path_to_repo_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.append(os.path.join(path_to_repo_dir, 'helper_functions')) +import scrna_helper_functions as hf +import gillis_style_coexpression_hf as coexpr + +# Plot style +plt.style.use('tal_paper') + +# ============================================================================ +# OUTPUT PATH + +out_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'tf_coexpr_Hs_vs_Dr_output') +if not os.path.exists(out_path): os.mkdir(out_path) + +# ============================================================================ +# THESE SPECIES + +species1 = hf.Species('Danio', 'rerio') +species2 = hf.Species('Homo', 'sapiens') + +# ------------------------------------ +# Zfish dataset + +print('Importing zebrafish dataset...') +adata_dr = sc.read_h5ad(os.path.join(hf.path.external_datasets, + 'Dr_zfish_kidney_marrow_Wang.h5ad')) + +# Save a copy of raw (unnormalized data) +adata_dr.layers['raw_unnorm_expression'] = adata_dr.X + +# Total count normalization +# (without normalizing layers['raw_unnorm_expression']) +norm = sc.pp.normalize_total(adata_dr, exclude_highly_expressed=False, + inplace=False, target_sum=10**4) +adata_dr.X = norm['X'] +del norm + +# Logarithmize +sc.pp.log1p(adata_dr, base=10) + +# Set `adata.raw` to normalized & logarithmized data +adata_dr.raw = adata_dr + +# PCA and nearest neighbor graph +sc.pp.pca(adata_dr) +sc.pp.neighbors(adata_dr) + +# ------------------------------------ +# Human dataset + +print('Importing human dataset...') +adata_hs = sc.read_h5ad(os.path.join(hf.path.external_datasets, + 'Hs_Tabula_Sapiens_blood_bone_marrow.h5ad')) + +# Save a copy of raw (unnormalized data) +adata_hs.layers['raw_unnorm_expression'] = adata_hs.X + +# Total count normalization +# (without normalizing layers['raw_unnorm_expression']) +norm = sc.pp.normalize_total(adata_hs, exclude_highly_expressed=False, + inplace=False, target_sum=10**4) +adata_hs.X = norm['X'] +del norm + +# Identify highly variable genes using Klein et al. 2015 method +# (assumes normalized counts but NOT log transformed) +hvg_list = hf.find_hvgs(adata_hs, min_cells=5) +plt.title(f'{len(hvg_list)} highly variable genes') +plt.close() +adata_hs.var['highly_variable'] = False +for g_idx in hvg_list: + g = adata_hs.var.index[g_idx] + adata_hs.var.loc[g, 'highly_variable'] = True +print(f'highly variable genes: {len(hvg_list)} passed') + +# Logarithmize +sc.pp.log1p(adata_hs, base=10) + +# Before filtering, set `adata_hs.raw` to normalized & logarithmized data +adata_hs.raw = adata_hs + +# Scale data +sc.pp.scale(adata_hs) + +# Run PCA on z-scored data +sc.tl.pca(adata_hs, use_highly_variable=True, n_comps=50) + +# Identify k nearest neighbors (Euclidean distance in PCA space) +sc.pp.neighbors(adata_hs, n_neighbors=15, n_pcs=50, use_rep='X_pca') + +# ------------------------------------ +# Orthology dict + +print('\nGetting OrthoFinder orthologs for all gene pairs...') +path_to_dict = os.path.join(path_to_repo_dir, 'helper_functions', 'gene_dictionaries') +with open(os.path.join(path_to_dict, 'hsap2drer.pickle'), 'rb') as fp: + hs2dr = pickle.load(fp) + dr2hs = pickle.load(fp) + +# ------------------------------------ +# Coexpression conservation + +print('Importing coexpression conservation scores...') +path_to_coexpr_data = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'coexpr_cons_Hs_vs_Dr_output') +coexpr_conservation = pd.read_csv( + os.path.join(path_to_coexpr_data, 'coexpr_conservation_all.csv'), + index_col=0) + +# ============================================================================ +# Heatmap plotting function +# (Copied and adjusted from SAMap heatmap plotting function) + +def plot_together(mtx_hs, mtx_dr, save_as='heatmaps', cmap=None): + if cmap is None: + cmap = sns.diverging_palette(230, 10, s=100, l=35, as_cmap=True) + + # Set figure size and aspect ratios + num_human_genes = mtx_hs.shape[1] + num_zfish_genes = mtx_dr.shape[1] + total_gene_rows = max(num_human_genes, num_zfish_genes) + plot_width = 6.5 + plot_height_adjust = 1.4335 + f, axs = plt.subplots( + 1, 2, + figsize=(plot_width, 0.08618*total_gene_rows + plot_height_adjust), + gridspec_kw={'width_ratios': [mtx_hs.shape[0], mtx_dr.shape[0]]} + ) + f_cbar, ax_cbar = plt.subplots(1, 1, figsize=(0.75, 1.2)) + yticks = [[], []] + + # Get cbar range for all heatmaps + vmax = max(mtx_hs.max().max(), mtx_dr.max().max()) + vmin = min(mtx_hs.min().min(), mtx_dr.min().min()) + + # Plot human expression + sns.heatmap( + mtx_hs.T, + cmap=cmap, + center=0, vmax=vmax, vmin=vmin, + xticklabels=True, yticklabels=False, + ax=axs[0], cbar=False, + ) + yticks[0] = mtx_hs.columns + + # Plot ciona expression + sns.heatmap( + mtx_dr.T, + cmap=cmap, + center=0, vmax=vmax, vmin=vmin, + xticklabels=True, yticklabels=False, + ax=axs[1], + cbar_kws={'label': 'Expression z-score'}, cbar_ax=ax_cbar + ) + yticks[1] = mtx_dr.columns + + # Set gene labels after tight_layout() to get consistent heatmap widths + f.tight_layout() + for i, mtx in enumerate([mtx_hs, mtx_dr]): + axs[i].set_yticks(np.arange(len(yticks[i])) + 0.5) + axs[i].set_yticklabels(yticks[i]) + f.subplots_adjust(left=0.12, wspace=0.6) + + f.savefig(os.path.join(out_path, f'{save_as}.pdf')) + plt.close() + + f_cbar.tight_layout() + f_cbar.savefig(os.path.join(out_path, f'{save_as}_cbar.pdf')) + plt.close() + +# ============================================================================ +# Get cluster centroids + +centr_hs = hf.cluster_centroids(adata_hs, groupby='cell_type_coarse') +centr_dr = hf.cluster_centroids(adata_dr, groupby='cell_type') + +# Adjust human cell type names +human_cell_types = [x.replace('-positive', '+').replace(',', '')\ + .replace('Kuppfer', 'Kupffer cell') + for x in centr_hs.index] +centr_hs.index = human_cell_types + +centr_hs_norm = centr_hs / centr_hs.max() +centr_dr_norm = centr_dr / centr_dr.max() +centr_z_hs = (centr_hs - centr_hs.mean()) / centr_hs.std() +centr_z_dr = (centr_dr - centr_dr.mean()) / centr_dr.std() + +# ============================================================================ + +# Number of top coexpression partners in human to look at +n=30 + +# Import coexpression networks themselves +network1 = np.load(os.path.join(path_to_coexpr_data, 'network1_weights.npy')) +network2 = np.load(os.path.join(path_to_coexpr_data, 'network2_weights.npy')) +gene_list1 = np.load(os.path.join(path_to_coexpr_data, 'network1_gene_list.npy')) +gene_list2 = np.load(os.path.join(path_to_coexpr_data, 'network2_gene_list.npy')) +network1 = pd.DataFrame(network1, index=gene_list1, columns=gene_list1) +network2 = pd.DataFrame(network2, index=gene_list2, columns=gene_list2) + +top_n_genes1 = coexpr.get_top_n(np.array(network1), n=n) +top_n_genes2 = coexpr.get_top_n(np.array(network2), n=n) +top_n_genes1 = pd.DataFrame(top_n_genes1, index=gene_list1, columns=gene_list1) +top_n_genes2 = pd.DataFrame(top_n_genes2, index=gene_list2, columns=gene_list2) + +del gene_list1, gene_list2 + +# ------------------------------------ +# Coexpression conservation example + +goi_list = [] + +# Genes involved in human/vertebrate hematopoiesis +hs_gene_list = [ + # High coexpression conservation + 'E2F8', + + # Genes related to hematopoiesis + 'SPI1', + 'CEBPA', + 'CEBPB', + # 'CEBPE', # no Zebrafish ortholog + 'IRF4', + 'IRF8', + 'JUN', + 'LEF1', + 'GFI1', + 'GATA2', + 'MITF', + 'STAT5A', + 'STAT5B', + # 'KLF4', # Low expression + 'RUNX1', + 'TAL1', + 'NFKB1', + # 'STAT3', # no Ciona ortholog + 'EGR1', + 'EGR2', + 'EGR3', + 'MAFB', + 'MAF', +] + +# For each human TF get homolog in Ciona +for g_hs in hs_gene_list: + g_dr_list = hs2dr[g_hs] + for g_dr in g_dr_list: + if g_dr in centr_dr.columns: + g_dr_print = g_dr + goi_list.append([g_hs, g_dr, g_dr_print]) + +out_path2 = os.path.join(out_path, 'example_coepxression') +if not os.path.exists(out_path2): os.mkdir(out_path2) + +score_per_tf = {} +for goi_hs, goi_dr, goi_dr_print in goi_list: + + # Get top n genes in human, sort by decreasing correlation rank + top_coexpr_partners2 = top_n_genes2.index[top_n_genes2.loc[goi_hs, :]] + + # Get orthologs of these genes in Ciona + orthologs_sp1 = [goi_dr] + for g in list(top_coexpr_partners2):#[goi_hs] + list(top_coexpr_partners2): + if g in hs2dr: + for g_dr in hs2dr[g]: + if (g_dr not in orthologs_sp1) and (g_dr in centr_dr.columns): + orthologs_sp1.append(g_dr) + + # Ciona matrix to plot + mtx_dr = centr_z_dr[orthologs_sp1] + # Remove NaNs + mtx_dr = mtx_dr.loc[:, mtx_dr.isnull().sum() == 0] + # Order of genes + gene_order_dr = np.argsort(mtx_dr.corrwith(mtx_dr.iloc[:, 0]))[::-1] + mtx_dr = mtx_dr.iloc[:, gene_order_dr] + # Order of cell types + mtx_dr = mtx_dr.sort_values(by=mtx_dr.columns[0])[::-1] + # Gene labeling + gene_labels = [] + for g_dr in mtx_dr.columns: + this_str = g_dr.replace('KY21.Chr', 'C')\ + .replace('KY21.UAContig', 'UAC') + ' (' + g_hs_list = [g_hs for g_hs in dr2hs[g_dr] + if (g_hs in top_coexpr_partners2) or (g_hs == goi_hs)] + this_str += '/'.join(g_hs_list) + ')' + gene_labels.append(this_str) + mtx_dr.columns = gene_labels + + # Human matrix to plot + mtx_hs = centr_z_hs[[goi_hs] + list(top_coexpr_partners2)] + # Order of genes + gene_ordered_list_hs = [] + for g_dr in mtx_dr.columns: + g_hs_str = g_dr.split('(')[1][:-1] + g_hs_list = g_hs_str.split('/') + for g_hs in g_hs_list: + if g_hs not in gene_ordered_list_hs: + gene_ordered_list_hs.append(g_hs) + mtx_hs = mtx_hs.loc[:, gene_ordered_list_hs] + # Order of cell types + mtx_hs = mtx_hs.sort_values(by=goi_hs)[::-1] + + plot_together(mtx_hs, mtx_dr, + save_as=os.path.join('example_coepxression', + f'{goi_hs}_{goi_dr}'), + cmap='seismic') + + # -------------------------------- + # Convert to score + + # Get correlation within Ciona genes + corr_values = [] + for g in mtx_dr.columns[1:]: + corr, pval = pearsonr(mtx_dr.iloc[:, 0], mtx_dr[g]) + corr_values.append([g, corr, pval]) + if pval > 0.05: break + corr_values = pd.DataFrame(corr_values, columns=['gene', 'corr', 'pval']) + passed, fdr = fdrcorrection(corr_values['pval'], alpha=0.05) + corr_values['fdr'] = fdr + + # Get genes above threshold + threshold_param = 'corr' + threshold = 0.5 + corr_values = corr_values[corr_values[threshold_param] > threshold] + + # Save score + score_per_tf[f'{goi_hs} vs. {goi_dr_print}'] = len(corr_values) + +# ------------------------------------ +# Summary plots + +# Bar graph summarizing values for several TFs +tf_list = np.array([tf for tf in score_per_tf]) +tf_scores = np.array([score_per_tf[tf] for tf in score_per_tf]) +reorder = np.argsort(tf_scores) +f = plt.figure(figsize=(1.75, 3)) +plt.barh(tf_list[reorder[:-1]], tf_scores[reorder[:-1]], color='#cc9200', zorder=3) +plt.barh(tf_list[reorder[-1]], tf_scores[reorder[-1]], color='#888888', zorder=3) +plt.ylabel('Cross-species TF pair') +plt.xlabel('Number of\nshared coexpr.\npartners') +plt.xlim(0, 30.5) +plt.ylim(-0.75, len(tf_list)-0.25) +plt.xticks(ticks=[0, 10, 20, 30], labels=[0, 10, 20, 30], fontsize=6.5) +plt.yticks(fontsize=6.5) +plt.grid(color='#d0d0d0', which='major', axis='x', linestyle='-', + linewidth=0.75, zorder=-1) +plt.tight_layout() +plt.savefig(os.path.join(out_path, 'summary_bar_graph.pdf')) +plt.close() + +# Bar graph with AUROCs instead +tf_list = [] +tf_scores = [] +for tf_hs, tf_dr, tf_dr_print in goi_list: + foo = coexpr_conservation.loc[(coexpr_conservation['Dr_gene'] == tf_dr) + * (coexpr_conservation['Hs_gene'] == tf_hs)] + if len(foo) > 0: + tf_scores.append(foo['raw_auroc'].values[0]) + tf_list.append(f'{tf_hs} vs. {tf_dr_print}') +tf_list = np.array(tf_list) +tf_scores = np.array(tf_scores) +reorder = np.argsort(tf_scores)#[::-1] +f = plt.figure(figsize=(1.75, 3)) +plt.barh(tf_list[reorder[:-1]], tf_scores[reorder[:-1]], color='#cc9200', zorder=3) +plt.barh(tf_list[reorder[-1]], tf_scores[reorder[-1]], color='#888888', zorder=3) +plt.axvline(x=0.5, linestyle='--', linewidth=0.75, color='k', zorder=4) +plt.ylabel('Cross-species TF pair') +plt.xlabel('Coexpression\nconservation\nscore (AUROC)') +plt.xlim(0, 1.01) +plt.ylim(-1, len(tf_list)) +plt.xticks(ticks=[0, 0.5, 1], labels=[0, 0.5, 1], fontsize=6.5) +plt.yticks(fontsize=6.5)#, rotation=90) +plt.grid(color='#d0d0d0', which='major', axis='x', linestyle='-', + linewidth=0.75, zorder=-1) +plt.tight_layout() +plt.savefig(os.path.join(out_path, 'summary_bar_graph_AUROCs.pdf')) +plt.close() diff --git a/Scully_Ciona_blood_2025/species_comparisons/gs_plots/README.md b/Scully_Ciona_blood_2025/species_comparisons/gs_plots/README.md index 0c2e900..548710f 100644 --- a/Scully_Ciona_blood_2025/species_comparisons/gs_plots/README.md +++ b/Scully_Ciona_blood_2025/species_comparisons/gs_plots/README.md @@ -3,6 +3,6 @@ The scripts in this folder were used to make generalized Sankey (GS) plots to co The files are: - `gs_plots_Hs_vs_Cr.py` is the python script that both makes GS plots and calculates Jaccard similarity scores for the human vs. _C. robusta_ comparison. The script outputs a folder `gs_plots_Hs_vs_Cr_output/` containing GS plots (with both humand and _C. robusta_ cell states at the top) and Jaccard similarity matrices. - `gs_plots_Hs_vs_Dr.py` is the python script which repeats these anlyses for the human vs. zebrafish comparison. The script saves outputs in a folder `gs_plots_Hs_vs_Dr_output/` containing containing GS plots (with both humand and zebrafish cell states at the top) and Jaccard similarity matrices. -- `single_cell_analysis_env.txt` contains information on the conda environment used, saved to a text file (by running `conda list --explicit > single_cell_analysis_env.txt`). +- `single_cell_analysis_env.yml` and `single_cell_analysis_env.txt` contain information on the conda environment used, saved to a text file (by running `conda env export --no-builds > single_cell_analysis_env.yml` and `conda list --explicit > single_cell_analysis_env.txt`). The .txt files is for MacOS only. Note that the functions for actually making the GS plots are in the helper_functions folder: `Scully_Ciona_blood_2025/helper_functions/gs_plot_helper_functions.py`. These functions are imported by the python scripts in this folder. diff --git a/Scully_Ciona_blood_2025/species_comparisons/gs_plots/single_cell_analysis_env.yml b/Scully_Ciona_blood_2025/species_comparisons/gs_plots/single_cell_analysis_env.yml new file mode 100644 index 0000000..d23db90 --- /dev/null +++ b/Scully_Ciona_blood_2025/species_comparisons/gs_plots/single_cell_analysis_env.yml @@ -0,0 +1,157 @@ +name: sc_env +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - appnope=0.1.2 + - backcall=0.2.0 + - ca-certificates=2021.7.5 + - certifi=2021.5.30 + - ipykernel=5.3.4 + - ipython_genutils=0.2.0 + - jupyter_client=6.1.12 + - jupyter_core=4.7.1 + - libcxx=10.0.0 + - libffi=3.3 + - libsodium=1.0.18 + - matplotlib-inline=0.1.2 + - ncurses=6.2 + - openssl=1.1.1k + - parso=0.8.2 + - pexpect=4.8.0 + - pickleshare=0.7.5 + - pip=21.1.3 + - ptyprocess=0.7.0 + - pygments=2.9.0 + - python=3.8.10 + - python-dateutil=2.8.2 + - pyzmq=20.0.0 + - readline=8.1 + - setuptools=52.0.0 + - six=1.16.0 + - sqlite=3.36.0 + - tk=8.6.10 + - tornado=6.1 + - traitlets=5.0.5 + - wcwidth=0.2.5 + - wheel=0.36.2 + - xz=5.2.5 + - zeromq=4.3.4 + - zlib=1.2.11 + - pip: + - anndata==0.7.6 + - annoy==1.17.0 + - anyio==3.4.0 + - argon2-cffi==21.1.0 + - astroid==2.13.3 + - attrs==21.2.0 + - babel==2.9.1 + - bbknn==1.6.0 + - bleach==4.1.0 + - blosc2==2.0.0 + - cffi==1.15.0 + - charset-normalizer==2.0.7 + - cycler==0.10.0 + - cython==0.29.24 + - decorator==4.4.2 + - defusedxml==0.7.1 + - dill==0.3.6 + - entrypoints==0.3 + - et-xmlfile==1.1.0 + - fastcluster==1.2.6 + - fbpca==1.0 + - geosketch==1.2 + - h5py==3.3.0 + - harmonypy==0.0.9 + - idna==3.3 + - imageio==2.9.0 + - importlib-metadata==8.4.0 + - importlib-resources==5.4.0 + - intervaltree==3.1.0 + - ipython==7.25.0 + - isort==5.11.4 + - jedi==0.18.0 + - jinja2==3.0.3 + - joblib==1.2.0 + - json5==0.9.6 + - jsonschema==4.2.1 + - jupyter-server==1.12.0 + - jupyterlab==3.2.4 + - jupyterlab-pygments==0.1.2 + - jupyterlab-server==2.8.2 + - kiwisolver==1.3.1 + - lazy-object-proxy==1.9.0 + - leidenalg==0.8.7 + - llvmlite==0.41.1 + - louvain==0.7.0 + - lxml==4.8.0 + - markupsafe==2.0.1 + - matplotlib==3.4.2 + - matplotlib-venn==0.11.9 + - mccabe==0.7.0 + - mistune==0.8.4 + - msgpack==1.0.8 + - natsort==7.1.1 + - nbclassic==0.3.4 + - nbclient==0.5.9 + - nbconvert==6.3.0 + - nbformat==5.1.3 + - nest-asyncio==1.5.1 + - networkx==2.5.1 + - notebook==6.4.6 + - numba==0.58.1 + - numexpr==2.7.3 + - numpy==1.24.4 + - openpyxl==3.1.2 + - packaging==21.0 + - pandas==1.3.0 + - pandocfilters==1.5.0 + - patsy==0.5.6 + - pillow==8.3.1 + - platformdirs==2.6.2 + - prometheus-client==0.12.0 + - prompt-toolkit==3.0.19 + - py-cpuinfo==9.0.0 + - pycparser==2.21 + - pydot==3.0.2 + - pylint==2.15.10 + - pynndescent==0.5.4 + - pyparsing==3.1.4 + - pyrsistent==0.18.0 + - python-igraph==0.9.6 + - python-louvain==0.16 + - pytz==2021.1 + - pywavelets==1.1.1 + - requests==2.26.0 + - scanorama==1.7.3 + - scanpy==1.8.1 + - scikit-image==0.18.2 + - scikit-learn==1.2.1 + - scipy==1.10.1 + - scrublet==0.2.3 + - seaborn==0.11.1 + - send2trash==1.8.0 + - sinfo==0.3.4 + - sniffio==1.2.0 + - sortedcontainers==2.4.0 + - statsmodels==0.13.1 + - stdlib-list==0.8.0 + - tables==3.8.0 + - terminado==0.12.1 + - testpath==0.5.0 + - texttable==1.6.4 + - threadpoolctl==2.2.0 + - tifffile==2021.8.8 + - tomli==2.0.1 + - tomlkit==0.11.6 + - tqdm==4.61.2 + - typing-extensions==4.4.0 + - umap-learn==0.5.1 + - urllib3==1.26.7 + - vireosnp==0.5.8 + - webencodings==0.5.1 + - websocket-client==1.2.1 + - wrapt==1.14.1 + - xlrd==1.2.0 + - zipp==3.6.0