From b400982eac0240d63379cf28c089db68077bee0b Mon Sep 17 00:00:00 2001 From: Francisco Maria Calisto Date: Sat, 28 Sep 2024 16:06:01 +0100 Subject: [PATCH] [UPDATE] DocStrings --- analysis/dashboards/interactive_dashboard.py | 50 ++++++++++++++++- analysis/plots/heatmap_modalities.py | 44 +++++++++++++++ analysis/plots/multi_panel_plot.py | 55 +++++++++++++++++++ analysis/plots/plot_birads_frequency.py | 42 ++++++++++++++ analysis/plots/plot_birads_mg_frequency.py | 55 +++++++++++++++++++ analysis/plots/plot_birads_mri_frequency.py | 38 +++++++++++++ .../plots/plot_birads_mri_side_by_side.py | 38 +++++++++++++ analysis/plots/plot_birads_us_frequency.py | 39 +++++++++++++ analysis/plots/plot_modalities.py | 38 +++++++++++++ analysis/plots/radar_chart.py | 38 +++++++++++++ analysis/plots/sankey_diagram_modalities.py | 39 +++++++++++++ analysis/plots/stacked_bar_chart.py | 38 +++++++++++++ analysis/plots/venn_diagram_modalities.py | 39 +++++++++++++ src/config/logging_config.py | 39 ++++++++++++- src/main.py | 43 +++++++++++++-- src/processing/anonymizer.py | 41 ++++++++++++++ src/processing/encryption.py | 37 +++++++++++++ src/processing/extractor.py | 41 ++++++++++++++ src/processing/processor.py | 39 +++++++++++++ src/utils/repeats.py | 34 ++++++++++-- src/validation/checker.py | 37 +++++++++++++ src/validation/compare.py | 41 ++++++++++++-- src/validation/identifier.py | 40 +++++++++++++- src/validation/laterality.py | 38 +++++++++++++ src/validation/reanonymizer.py | 42 ++++++++++++-- src/validation/reidentified.py | 38 ++++++++++++- tests/test_anonymizer.py | 39 +++++++++++++ tests/test_encryption.py | 34 ++++++++++++ tests/test_extractor.py | 41 ++++++++++++++ tests/test_processor.py | 13 ++++- 30 files changed, 1165 insertions(+), 25 deletions(-) diff --git a/analysis/dashboards/interactive_dashboard.py b/analysis/dashboards/interactive_dashboard.py index 91e3587..b1d1b8f 100644 --- a/analysis/dashboards/interactive_dashboard.py +++ 
b/analysis/dashboards/interactive_dashboard.py @@ -1,7 +1,55 @@ #!/usr/bin/env python """ -interactive_dashboard.py: Create an interactive dashboard to explore the frequency distribution of BIRADS scores across different imaging modalities. +interactive_dashboard.py: Create an interactive dashboard to explore the frequency +distribution of BIRADS scores across different imaging modalities. +This script reads a dataset of patients with BIRADS scores for mammography, +ultrasound, and MRI modalities. It then calculates the number of patients with +different combinations of modalities and creates an interactive dashboard. +The dashboard allows users to explore the distribution of BIRADS scores for +each modality and their combinations. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Calculate the number of patients with different combinations of modalities. +- Create an interactive dashboard to explore the distribution of BIRADS scores. + +Expected Usage: +- Run the script to generate an interactive dashboard. +- Open the dashboard in a web browser to explore the data interactively. +- Update the script to customize the dataset or modify the dashboard as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the dashboard. +- The dashboard layout, style, and interactivity can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. 
+- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Dash library: https://dash.plotly.com/ +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis + +Example: +- Run the script to generate an interactive dashboard of BIRADS score distributions. +- Open the dashboard in a web browser to explore the data interactively. +- Update the script to customize the dataset or modify the dashboard as needed. +- python interactive_dashboard.py """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/heatmap_modalities.py b/analysis/plots/heatmap_modalities.py index 166f8cc..8edeab5 100644 --- a/analysis/plots/heatmap_modalities.py +++ b/analysis/plots/heatmap_modalities.py @@ -2,6 +2,50 @@ """ heatmap_modalities.py: Plot the frequency of patients with different combinations of imaging modalities using a heatmap. +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then calculates the number of patients with different combinations of modalities and plots a heatmap. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Calculate the number of patients with different combinations of modalities. +- Plot a heatmap showing the frequency of patients with different imaging modality combinations. + +Expected Usage: +- Run the script to generate a heatmap of imaging modality combinations. +- Check the output figure to visualize the frequency of patients across different combinations. 
+- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the heatmap. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Heatmaps: https://en.wikipedia.org/wiki/Heat_map +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis + +Example: +- Run the script to generate a heatmap of imaging modality combinations. +- Check the output figure to visualize the frequency of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. 
+- python heatmap_modalities.py """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/multi_panel_plot.py b/analysis/plots/multi_panel_plot.py index 8798caf..966a56e 100644 --- a/analysis/plots/multi_panel_plot.py +++ b/analysis/plots/multi_panel_plot.py @@ -2,6 +2,61 @@ """ multi_panel_plot.py: Create a multi-panel plot showing the number of patients per imaging modality. +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then calculates the number of patients with different combinations of modalities and creates a multi-panel plot. +The plot shows the distribution of patients with mammography, ultrasound, and MRI images side by side. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Calculate the number of patients with different combinations of modalities. +- Create a multi-panel plot showing the distribution of patients across different modality combinations. + +Expected Input: +- The script requires a dataset of patients with BIRADS scores for different modalities. +- The dataset should include columns for each modality (CCL, CCR, MLOL, MLOR, USL, USR, MRIL, MRIR) with BIRADS scores. +- The BIRADS scores should range from 1 to 5, with missing or invalid values handled appropriately. + +Output: +- The script generates an interactive HTML file with the multi-panel plot. +- The plot shows the distribution of patients with mammography, ultrasound, and MRI images. +- The chart can be viewed in a web browser or embedded in a web page. + +Expected Usage: +- Run the script to generate a multi-panel plot of imaging modality counts. +- Check the output figure to visualize the distribution of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. 
+- Additional metadata or information can be included in the multi-panel plot. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Multi-panel plots: https://en.wikipedia.org/wiki/Multipanel_plot +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis + +Example: +- Run the script to generate a multi-panel plot of imaging modality counts. +- Check the output figure to visualize the distribution of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. +- python multi_panel_plot.py """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/plot_birads_frequency.py b/analysis/plots/plot_birads_frequency.py index 3cf12f5..fb6446d 100644 --- a/analysis/plots/plot_birads_frequency.py +++ b/analysis/plots/plot_birads_frequency.py @@ -3,6 +3,48 @@ """ plot_birads_frequency.py: Plot frequency of patients with MGs, US images, and MRIs for each BIRADS score, handling multiple entries per cell by considering the highest BIRADS score and multiple delimiters. 
+ +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then calculates the number of patients with different combinations of modalities and plots a stacked bar chart. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Calculate the number of patients with different combinations of modalities. +- Plot a stacked bar chart showing the distribution of patients across different modality combinations. + +Expected Usage: +- Run the script to generate a stacked bar chart of imaging modality combinations. +- Check the output figure to visualize the distribution of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the stacked bar chart. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. 
+ +References: +- Plotly library: https://plotly.com/python/ +- Stacked bar charts: https://en.wikipedia.org/wiki/Stacked_bar_chart +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis + +Example: +- Run the script to generate a stacked bar chart of imaging modality combinations. """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/plot_birads_mg_frequency.py b/analysis/plots/plot_birads_mg_frequency.py index 2db7a87..cf49f9d 100644 --- a/analysis/plots/plot_birads_mg_frequency.py +++ b/analysis/plots/plot_birads_mg_frequency.py @@ -3,6 +3,61 @@ """ plot_birads_mg_frequency.py: Plot the frequency of patients for each MG type (CCL, CCR, MLOL, MLOR) with BIRADS scores from 1 to 5. +This script reads a dataset of patients with BIRADS scores for mammography, +ultrasound, and MRI modalities. It then counts the patients with each BIRADS +score (1 to 5) for every MG type and plots a stacked bar chart. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Count the patients with each BIRADS score for each MG type (CCL, CCR, MLOL, MLOR). +- Plot a stacked bar chart showing the frequency of patients per MG type and BIRADS score. + +Expected Input: +- The script requires a dataset of patients with BIRADS scores for different modalities. +- The dataset should include a column for each MG type (CCL, CCR, MLOL, MLOR) with BIRADS scores. +- The BIRADS scores should range from 1 to 5, with missing or invalid values handled appropriately. + +Output: +- The script generates an interactive HTML file with the stacked bar chart. +- The chart shows the frequency of patients per MG type and BIRADS score. +- The chart can be viewed in a web browser or embedded in a web page. + +Expected Usage: +- Run the script to generate a stacked bar chart of BIRADS score frequencies per MG type.
+- Check the output figure to visualize the distribution of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the stacked bar chart. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Stacked bar charts: https://en.wikipedia.org/wiki/Stacked_bar_chart +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis + +Example: +- Run the script to generate a stacked bar chart of imaging modality combinations. +- Check the output figure to visualize the distribution of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. 
+- python plot_birads_mg_frequency.py """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/plot_birads_mri_frequency.py b/analysis/plots/plot_birads_mri_frequency.py index 590044f..271d827 100644 --- a/analysis/plots/plot_birads_mri_frequency.py +++ b/analysis/plots/plot_birads_mri_frequency.py @@ -2,6 +2,44 @@ """ plot_birads_mri_frequency.py: Plot the frequency of patients with MRI images per BIRADS category. +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then counts the patients with MRI images in each BIRADS category and plots a stacked bar chart. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Count the patients with MRI images in each BIRADS category. +- Plot a stacked bar chart showing the frequency of patients per BIRADS category. + +Expected Usage: +- Run the script to generate a stacked bar chart of MRI BIRADS frequencies. +- Check the output figure to visualize the frequency of patients across BIRADS categories. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the stacked bar chart. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data.
+- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Stacked bar charts: https://en.wikipedia.org/wiki/Stacked_bar_chart +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/plot_birads_mri_side_by_side.py b/analysis/plots/plot_birads_mri_side_by_side.py index 2ce9ab3..e1e12b1 100644 --- a/analysis/plots/plot_birads_mri_side_by_side.py +++ b/analysis/plots/plot_birads_mri_side_by_side.py @@ -2,6 +2,44 @@ """ plot_birads_mri_side_by_side.py: Plot the frequency of patients with MRI images per BIRADS category, displayed side-by-side. +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then counts the patients with MRI images in each BIRADS category and plots the counts side by side. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Count the patients with MRI images in each BIRADS category. +- Plot the per-category frequencies of patients with MRI images side by side. + +Expected Usage: +- Run the script to generate a side-by-side plot of MRI BIRADS frequencies. +- Check the output figure to compare the frequency of patients across BIRADS categories. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities.
+- Additional metadata or information can be included in the stacked bar chart. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Stacked bar charts: https://en.wikipedia.org/wiki/Stacked_bar_chart +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/plot_birads_us_frequency.py b/analysis/plots/plot_birads_us_frequency.py index fe730fc..6428073 100644 --- a/analysis/plots/plot_birads_us_frequency.py +++ b/analysis/plots/plot_birads_us_frequency.py @@ -3,6 +3,45 @@ """ plot_birads_us_frequency.py: Plot the frequency of patients with ultrasound images per BIRADS category. +This script reads a dataset of patients with BIRADS scores for mammography, +ultrasound, and MRI modalities. It then calculates the number of patients +with different combinations of modalities and plots a stacked bar chart. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. 
+- Calculate the number of patients with different combinations of modalities. +- Plot a stacked bar chart showing the distribution of patients across different modality combinations. + +Expected Usage: +- Run the script to generate a stacked bar chart of imaging modality combinations. +- Check the output figure to visualize the distribution of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the stacked bar chart. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. 
+ +References: +- Plotly library: https://plotly.com/python/ +- Stacked bar charts: https://en.wikipedia.org/wiki/Stacked_bar_chart +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/plot_modalities.py b/analysis/plots/plot_modalities.py index 0e4f6c1..a4ad1cb 100644 --- a/analysis/plots/plot_modalities.py +++ b/analysis/plots/plot_modalities.py @@ -2,6 +2,44 @@ """ plot_modalities.py: Plot the number of patients per imaging modality. +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then calculates the number of patients with different combinations of modalities and plots a stacked bar chart. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Calculate the number of patients with different combinations of modalities. +- Plot a stacked bar chart showing the distribution of patients across different modality combinations. + +Expected Usage: +- Run the script to generate a stacked bar chart of imaging modality combinations. +- Check the output figure to visualize the distribution of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the stacked bar chart. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. 
+ +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Stacked bar charts: https://en.wikipedia.org/wiki/Stacked_bar_chart +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/radar_chart.py b/analysis/plots/radar_chart.py index e15f621..2256469 100644 --- a/analysis/plots/radar_chart.py +++ b/analysis/plots/radar_chart.py @@ -2,6 +2,44 @@ """ radar_chart.py: Plot a radar chart showing the frequency distribution of BIRADS scores across different imaging modalities. +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then calculates the frequency distribution of BIRADS scores for each modality and plots a radar chart. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Prepare data for radar chart visualization by processing BIRADS scores. +- Plot a radar chart showing the frequency distribution of BIRADS scores by modality. + +Expected Usage: +- Run the script to generate a radar chart of BIRADS score distributions. +- Check the output figure to visualize the distribution of BIRADS scores across different modalities. +- Update the script to customize the dataset or modify the plot as needed. 
+ +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the radar chart. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of BIRADS scores for different imaging modalities. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Radar charts: https://en.wikipedia.org/wiki/Radar_chart +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/sankey_diagram_modalities.py b/analysis/plots/sankey_diagram_modalities.py index daad4de..02e62bc 100644 --- a/analysis/plots/sankey_diagram_modalities.py +++ b/analysis/plots/sankey_diagram_modalities.py @@ -2,6 +2,45 @@ """ sankey_diagram_modalities.py: Plot a Sankey diagram showing the flow of patients through different imaging modalities. + +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then calculates the number of patients with different combinations of modalities and plots a Sankey diagram. 
+ +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Calculate the number of patients with different combinations of modalities. +- Plot a Sankey diagram showing the flow of patients through different imaging modalities. + +Expected Usage: +- Run the script to generate a Sankey diagram of imaging modality combinations. +- Check the output figure to visualize the flow of patients through different modalities. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the Sankey diagram. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the flow of patients through different imaging modalities. +- The script can be integrated into a larger data processing or analysis workflow. 
+ +References: +- Plotly library: https://plotly.com/python/ +- Sankey diagrams: https://en.wikipedia.org/wiki/Sankey_diagram +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/stacked_bar_chart.py b/analysis/plots/stacked_bar_chart.py index af54ea0..6f7eb9e 100644 --- a/analysis/plots/stacked_bar_chart.py +++ b/analysis/plots/stacked_bar_chart.py @@ -2,6 +2,44 @@ """ stacked_bar_chart.py: Plot the number of patients per imaging modality. +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then calculates the number of patients with different combinations of modalities and plots a stacked bar chart. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Calculate the number of patients with different combinations of modalities. +- Plot a stacked bar chart showing the distribution of patients across different modality combinations. + +Expected Usage: +- Run the script to generate a stacked bar chart of imaging modality combinations. +- Check the output figure to visualize the distribution of patients across different combinations. +- Update the script to customize the dataset or modify the plot as needed. + +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the stacked bar chart. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the Plotly library to create interactive and visually appealing plots. +- The script is compatible with Python 3.6+ and common data science libraries. 
+ +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- Plotly library: https://plotly.com/python/ +- Stacked bar charts: https://en.wikipedia.org/wiki/Stacked_bar_chart +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis """ __author__ = "Francisco Maria Calisto" diff --git a/analysis/plots/venn_diagram_modalities.py b/analysis/plots/venn_diagram_modalities.py index 7e10cc4..66269c0 100644 --- a/analysis/plots/venn_diagram_modalities.py +++ b/analysis/plots/venn_diagram_modalities.py @@ -2,6 +2,45 @@ """ venn_diagram_modalities.py: Plot a Venn diagram of patients with different combinations of imaging modalities. + +This script reads a dataset of patients with BIRADS scores for mammography, ultrasound, and MRI modalities. +It then calculates the number of patients with different combinations of modalities and plots a Venn diagram. + +Key Functions: +- Load the dataset of patients with BIRADS scores for different modalities. +- Calculate the number of patients with different combinations of modalities. +- Plot a Venn diagram showing the overlap between different modalities. + +Expected Usage: +- Run the script to generate a Venn diagram of imaging modality combinations. +- Check the output figure to visualize the overlap between modalities. +- Update the script to customize the dataset or modify the plot as needed. 
+ +Customization & Flexibility: +- The script can be adapted to work with different datasets or modalities. +- Additional metadata or information can be included in the Venn diagram. +- The plot style, colors, and labels can be customized based on requirements. + +Performance & Compatibility: +- The script is optimized for performance when handling large datasets. +- It uses the matplotlib_venn library to create Venn diagrams efficiently. +- The script is compatible with Python 3.6+ and common data science libraries. + +Best Practices & Maintenance: +- The script follows best practices for data visualization and analysis. +- It provides a clear and informative representation of imaging modality data. +- The script is well-documented and can be easily maintained or extended. + +Notes: +- This script is part of a data analysis pipeline for multimodal breast imaging data. +- It is designed to visualize the distribution of imaging modalities for patients. +- The script can be integrated into a larger data processing or analysis workflow. + +References: +- matplotlib_venn library: https://pypi.org/project/matplotlib-venn/ +- Venn diagrams: https://en.wikipedia.org/wiki/Venn_diagram +- Data visualization: https://en.wikipedia.org/wiki/Data_visualization +- Data analysis: https://en.wikipedia.org/wiki/Data_analysis """ __author__ = "Francisco Maria Calisto" diff --git a/src/config/logging_config.py b/src/config/logging_config.py index fdfd451..1c3355c 100644 --- a/src/config/logging_config.py +++ b/src/config/logging_config.py @@ -1,4 +1,41 @@ -# logging_config.py +#!/usr/bin/env python + +""" +logging_config.py: Module for setting up logging configuration using a YAML file. + +This module provides a function to set up logging configuration based on a YAML file. +The configuration file specifies the log file path, log level, and log format. + +Key Functions: +- setup_logging: Set up logging configuration using a YAML file. 
+ +Expected Usage: +- Call `setup_logging` at the beginning of the main script to configure logging. +- Use the logging module to log messages to the specified log file and console. + +Customization & Flexibility: +- The logging configuration can be customized by editing the YAML file. +- Additional handlers or formatters can be added to the logging configuration. +- The log level can be adjusted to control the verbosity of log messages. + +Performance & Compatibility: +- The logging configuration is optimized for performance and resource usage. +- The module is compatible with Python 3.6+ and can be used in various environments and platforms. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This module is part of a larger data curation pipeline for multimodal breast imaging data. +- It only configures logging and does not process DICOM files itself, so any script in the pipeline can reuse it. +- The module is intended to be imported by scripts that run from the command line or as part of an automated workflow. + +References: +- Logging configuration: https://docs.python.org/3/library/logging.config.html +- YAML format: https://yaml.org/spec/1.2/spec.html +""" import logging import logging.config diff --git a/src/main.py b/src/main.py index b15ca17..98fffbc 100644 --- a/src/main.py +++ b/src/main.py @@ -7,10 +7,45 @@ and runs the data processing pipeline by invoking the `process_directory` function from the `processing.processor` module. It handles DICOM files and logs the results efficiently. -Improvements: -- Added memory monitoring before and after batch processing. -- Implemented explicit garbage collection to optimize memory usage for large datasets. -- Enhanced logging to trace each step in detail.
+Key Functions: +- Set up logging to capture runtime events and errors. +- Monitor memory usage before and after processing to optimize resource utilization. +- Process DICOM files in batches using the `process_directory` function. +- Save the mapping file with a timestamped filename for tracking purposes. + +Expected Input: +- A directory containing DICOM files for processing. +- Output directories for saving processed files and logs. +- A batch size for processing a specified number of files at a time. + +Output: +- Processed DICOM files saved in the output directory. +- A mapping file with the original and processed filenames for reference. +- Log files with detailed information on the processing steps and memory usage. + +Intended Use Case: +- This script is designed for processing large datasets of DICOM files efficiently. +- It can be used to curate and prepare medical imaging data for research or analysis. +- The script is optimized for handling memory-intensive tasks and monitoring resource usage. + +Customization & Flexibility: +- The batch size can be adjusted based on the available system resources. +- Additional logging configurations or output formats can be added for specific use cases. +- The script can be extended to support other types of medical imaging data or metadata. + +Performance & Compatibility: +- The script is optimized for performance and efficiency when processing large datasets. +- It uses memory monitoring and garbage collection to optimize resource utilization. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. 
+- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" diff --git a/src/processing/anonymizer.py b/src/processing/anonymizer.py index b8f730b..c88ea7f 100644 --- a/src/processing/anonymizer.py +++ b/src/processing/anonymizer.py @@ -5,6 +5,47 @@ information and renaming them according to a specified format. This script handles the anonymization of sensitive fields in DICOM files and provides options to save the metadata before and after the anonymization process. + +Key Functions: +- is_dicom_file: Check if a file is a valid DICOM file. +- anonymize_dicom_file: Anonymize a DICOM file by removing patient-related information. +- anonymize_field: Replace a DICOM field's value with a new value. +- anonymize_sequences: Anonymize specific DICOM sequence fields. +- save_dicom: Save the anonymized DICOM file with a new name. +- anonymize_directory: Anonymize all DICOM files in a directory. + +Expected Input: +- A directory containing DICOM files to be anonymized. +- An optional configuration file with anonymization rules. +- Output directory for saving the anonymized files. + +Output: +- Anonymized DICOM files saved in the output directory. +- Metadata files saved before and after anonymization for each DICOM file. + +Intended Use Case: +- This script is useful for anonymizing DICOM files for research or sharing purposes. +- It can be used to remove sensitive patient information from medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. + +Customization & Flexibility: +- The script can be extended to support additional anonymization rules or fields. +- It can be adapted to handle other types of medical imaging data or metadata. 
+- The script can be integrated into a larger data curation pipeline for multimodal breast imaging data. + +Performance & Compatibility: +- The script is optimized for performance and efficiency when processing DICOM files. +- It uses the pydicom library for reading and writing DICOM files. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" diff --git a/src/processing/encryption.py b/src/processing/encryption.py index 9f57b78..5a371c7 100644 --- a/src/processing/encryption.py +++ b/src/processing/encryption.py @@ -2,6 +2,43 @@ """ encryption.py: Module for encrypting patient IDs. + +This module provides functions for encrypting patient IDs using the SHA-256 hash function +combined with a secret phrase. The encrypted patient IDs are truncated to match the length +of the original IDs to ensure consistency and compatibility with existing data. + +Key Functions: +- read_secret_phrase: Read the secret phrase from an external file. +- encrypt_patient_id: Encrypt a patient ID using the secret phrase. + +Expected Usage: +- Read the secret phrase from a file using `read_secret_phrase`. +- Encrypt a patient ID using `encrypt_patient_id`. + +Customization & Flexibility: +- The encryption algorithm can be easily modified or extended to use different hash functions. +- The secret phrase can be updated or changed to enhance security and protect sensitive data. 
+- The encryption process can be adapted to include additional metadata or parameters. + +Performance & Compatibility: +- The encryption process is optimized for efficiency and security using the SHA-256 hash function. +- The module is compatible with Python 3.6+ and can be used in various environments and platforms. +- The encryption functions are designed to handle large datasets and sensitive patient information. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This module is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. + +References: +- SHA-256 hash function: https://en.wikipedia.org/wiki/SHA-2 +- Secure hash algorithms: https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf +- Cryptographic hash functions: https://en.wikipedia.org/wiki/Cryptographic_hash_function """ __author__ = "Francisco Maria Calisto" diff --git a/src/processing/extractor.py b/src/processing/extractor.py index c8015e6..3756cbe 100644 --- a/src/processing/extractor.py +++ b/src/processing/extractor.py @@ -4,6 +4,47 @@ extractor.py: Module to extract relevant information from DICOM files. This module reads DICOM files and extracts essential metadata, including patient information, modality, and image details. +The extracted information is used for data processing and analysis. + +Key Functions: +- extract_dicom_info: Extract relevant information from a DICOM file. +- extract_patient_id: Extract the patient ID from DICOM metadata. 
+- extract_modality: Extract the modality (e.g., MG, US, MR) from DICOM metadata. +- extract_image_laterality: Extract the image laterality from DICOM metadata. +- extract_view_position: Extract the image view position from DICOM metadata. +- extract_study_date: Extract the study date from DICOM metadata. +- extract_scanning_sequence: Extract the scanning sequence from DICOM metadata. +- extract_series_description: Extract the series description from DICOM metadata. +- extract_instance_number: Extract the instance number from DICOM metadata. + +Expected Usage: +- Extract relevant information from a DICOM file using `extract_dicom_info`. +- Access specific attributes using individual extraction functions. +- Process the extracted information for analysis or visualization. + +Customization & Flexibility: +- The extraction functions can be extended to include additional attributes. +- The extracted information can be formatted or transformed based on requirements. +- The module can be integrated into existing data processing pipelines. + +Performance & Compatibility: +- The module is optimized for processing DICOM files efficiently. +- It uses the pydicom library for reading DICOM files. +- The module is compatible with Python 3.6+ and can be used in various environments and platforms. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This module is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow.
+ +References: +- pydicom library: https://pydicom.github.io/ +- DICOM standard: https://www.dicomstandard.org/ """ __author__ = "Francisco Maria Calisto" diff --git a/src/processing/processor.py b/src/processing/processor.py index 148127f..d518f1c 100644 --- a/src/processing/processor.py +++ b/src/processing/processor.py @@ -5,6 +5,45 @@ This script processes DICOM files by anonymizing and preparing the dataset, ensuring efficient handling of large files using batch processing. +It extracts metadata from DICOM files, anonymizes patient IDs, and saves the files for further analysis. + +Key Functions: +- Process a directory of DICOM files in batches using the `process_directory` function. +- Extract metadata from DICOM files and anonymize patient IDs. +- Save anonymized DICOM files to an output directory and update the mapping file. + +Expected Input: +- A directory containing DICOM files to be processed. +- An output directory to save anonymized DICOM files. +- A mapping file to store the mapping of original and anonymized patient IDs. + +Output: +- Anonymized DICOM files saved in the output directory. +- A mapping file with the original and anonymized patient IDs for reference. + +Intended Use Case: +- This script is designed for processing large datasets of DICOM files efficiently. +- It can be used to curate and prepare medical imaging data for research or analysis. +- The script is optimized for handling memory-intensive tasks and monitoring resource usage. + +Customization & Flexibility: +- The batch size can be adjusted based on the available system resources. +- Additional logging configurations or output formats can be added for specific use cases. +- The script can be extended to support other types of medical imaging data or metadata. + +Performance & Compatibility: +- The script is optimized for performance and efficiency when processing large datasets. +- It uses memory monitoring and garbage collection to optimize resource utilization. 
+ +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" diff --git a/src/utils/repeats.py b/src/utils/repeats.py index 8beeb92..ebe1475 100644 --- a/src/utils/repeats.py +++ b/src/utils/repeats.py @@ -6,13 +6,39 @@ This script processes a CSV file and identifies rows where the 'anonymized_patient_id' or 'real_patient_id' columns contain repeated values. The script then prints these rows for further analysis. -Intended Use Case: -- This script is useful for identifying duplicate entries in either the 'anonymized_patient_id' or 'real_patient_id' columns of a CSV file, - which can be critical for data cleaning, validation, or analysis tasks. +Key Functions: +- Load a CSV file containing patient mapping data. +- Identify and print rows with repeated 'anonymized_patient_id' or 'real_patient_id' values. +- Provide summary statistics on the number of repeated values and unique values. Expected Input: -- A CSV file containing the data to be analyzed. +- A CSV file containing patient mapping data with 'anonymized_patient_id' and 'real_patient_id' columns. + +Output: +- The script prints rows with repeated 'anonymized_patient_id' or 'real_patient_id' values. +- It also provides summary statistics on the number of repeated values and unique values. +Intended Use Case: +- This script is useful for identifying potential issues with patient mapping data. 
+- It can be used to detect duplicate or inconsistent patient IDs in a dataset. + +Customization & Flexibility: +- The script can be extended to check for repeats in additional columns or fields. +- It can be adapted to handle other types of data or metadata. + +Performance & Compatibility: +- The script is designed for performance and efficiency when processing large datasets. +- It uses the pandas library for data manipulation and analysis. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. +- It operates on the CSV patient mapping file rather than on DICOM files, and can be adapted to other tabular mapping data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" diff --git a/src/validation/checker.py b/src/validation/checker.py index ed1d599..96dee36 100644 --- a/src/validation/checker.py +++ b/src/validation/checker.py @@ -5,6 +5,43 @@ This script compares anonymized and non-anonymized DICOM files to identify matching files based on the `SOPInstanceUID` metadata. It is optimized for large datasets by using batch processing, lazy loading of files, and memory management techniques. + +Key Functions: +- Compare anonymized and non-anonymized DICOM files in batches. +- Index non-anonymized files by `SOPInstanceUID` for fast lookup. +- Move files to the `checked` or `unsolvable` directories based on the comparison results. + +Expected Input: +- Anonymized and non-anonymized DICOM files in separate directories. +- A CSV file containing mappings from Real Patient IDs to Anonymized Patient IDs.
+ +Output: +- The script moves anonymized files to the `checked` directory if a match is found. +- If no match is found, the files are moved to the `unsolvable` directory. +- The script logs the progress and results of the comparison. + +Intended Use Case: +- This script is useful for validating the anonymization process and ensuring that the correct files are anonymized. +- It can be used as part of a data curation pipeline to verify the integrity of DICOM files. + +Customization & Flexibility: +- The script can be easily extended to support additional metadata fields for comparison. +- The batch size can be adjusted to optimize performance based on the available system resources. +- The script can be adapted to handle other types of medical imaging data or metadata. + +Performance & Compatibility: +- The script is designed for performance and efficiency when processing large datasets. +- It uses multiprocessing to parallelize the comparison of DICOM files and optimize resource utilization. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" diff --git a/src/validation/compare.py b/src/validation/compare.py index 329060c..db373c0 100644 --- a/src/validation/compare.py +++ b/src/validation/compare.py @@ -6,10 +6,43 @@ This script processes and compares anonymized and non-anonymized DICOM files. 
It supports MG (Mammography), US (Ultrasound), and MRI (Magnetic Resonance Imaging) modalities. The script matches DICOM files based on metadata such as `SOPInstanceUID`, `ViewPosition`, and `ImageLaterality`. Once matched, anonymized files are renamed and moved to a 'compared' directory for further processing. Key Functions: -- Load a mapping between anonymized and real patient IDs from a CSV file. -- Identify and validate DICOM files across different modalities. -- Extract and compare relevant metadata from anonymized and non-anonymized DICOM files. -- Rename files according to metadata and organize them into a 'compared' directory. +- Load a CSV file containing patient mapping data. +- Find all DICOM files in the 'comparing' directory. +- Index non-anonymized files by `SOPInstanceUID` for fast lookup. +- Process anonymized files, match based on `PatientID` and `SOPInstanceUID`, and rename. +- Move the matched files to the 'compared' directory for further analysis. + +Expected Input: +- Anonymized and non-anonymized DICOM files in separate directories. +- A CSV file containing mappings from Real Patient IDs to Anonymized Patient IDs. + +Output: +- The script moves anonymized files to the `compared` directory if a match is found. +- The matched files are renamed based on the `ViewPosition` and `ImageLaterality` metadata. +- The script logs the progress and results of the comparison. + +Intended Use Case: +- This script is useful for validating the anonymization process and ensuring that the correct files are anonymized. +- It can be used as part of a data curation pipeline to verify the integrity of DICOM files. + +Customization & Flexibility: +- The script can be easily extended to support additional metadata fields for comparison. +- It can be adapted to handle other types of medical imaging data or metadata. +- The script can be modified to support other modalities or imaging techniques. 
+ +Performance & Compatibility: +- The script is designed for performance and efficiency when processing large datasets. +- It uses multiprocessing to parallelize the comparison of DICOM files and optimize resource utilization. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" diff --git a/src/validation/identifier.py b/src/validation/identifier.py index 6965395..801436b 100644 --- a/src/validation/identifier.py +++ b/src/validation/identifier.py @@ -6,10 +6,48 @@ This script processes DICOM files located in the "checking" folder by extracting the SOP Instance UID from each DICOM file. It then searches the "raw" folder and its subfolders to find a matching SOP Instance UID. If a match is found, it reads the Real Patient ID from the matching file in the "raw" folder and compares -it to the Anonymized Patient ID in the "mapping.csv" file. If a matching Anonymized Patient ID is found, +it to the Anonymized Patient ID in the "mamo_patients_mapping_data.csv" file. If a matching Anonymized Patient ID is found, the script renames the DICOM file in the "checking" folder by replacing the Patient ID part of the filename with the Anonymized Patient ID, and then moves the file to the "identified" folder. If no match is found, the DICOM file is moved to the "unsolvable" folder. Optimized for large datasets with parallel processing and efficient file handling. 
+ +Key Functions: +- Load a CSV file containing patient mapping data. +- Load SOP Instance UIDs from the raw DICOM folder. +- Extract the Patient ID from DICOM metadata. +- Move files between folders based on processing results. +- Rename DICOM files with anonymized Patient IDs. + +Expected Input: +- DICOM files in the "checking" folder and DICOM files in the "raw" folder. +- A CSV file containing mappings from Real Patient IDs to Anonymized Patient IDs. + +Output: +- The script renames and moves DICOM files to the "identified" folder if a match is found. +- The script moves DICOM files to the "unsolvable" folder if no match is found. + +Intended Use Case: +- This script is useful for identifying and renaming DICOM files with anonymized Patient IDs. +- It can be used to validate the anonymization process and ensure that the correct files are anonymized. + +Customization & Flexibility: +- The script can be extended to handle additional metadata fields or file formats. +- It can be adapted to process other types of medical imaging data or metadata. +- The script can be integrated into a larger data curation pipeline for multimodal breast imaging data. + +Performance & Compatibility: +- The script is designed for performance and efficiency when processing large datasets. +- It uses multiprocessing to parallelize the file processing and optimize resource utilization. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. 
""" __author__ = "Francisco Maria Calisto" diff --git a/src/validation/laterality.py b/src/validation/laterality.py index 96c6838..75f5827 100644 --- a/src/validation/laterality.py +++ b/src/validation/laterality.py @@ -7,6 +7,44 @@ is correctly linked and organized, particularly in the context of anonymization, for both US (ultrasound) and MG (mammography) modalities. This version is optimized for massive datasets through batch processing, parallel processing, and optimized I/O operations. + +Key Functions: +- Process a directory of DICOM files in batches using the `process_directory` function. +- Extract metadata from DICOM files and check for the "Modality" and "Laterality" tags. +- Organize DICOM files based on the "Modality" and "Laterality" tags for further analysis. + +Expected Input: +- A directory containing DICOM files to be processed. +- An output directory to save organized DICOM files. +- A batch size for processing a specified number of files at a time. + +Output: +- Organized DICOM files saved in the output directory based on the "Modality" and "Laterality" tags. +- Log files with detailed information on the processing steps and results. + +Intended Use Case: +- This script is designed for processing large datasets of DICOM files efficiently. +- It can be used to organize and prepare medical imaging data for research or analysis. +- The script is optimized for handling memory-intensive tasks and monitoring resource usage. + +Customization & Flexibility: +- The batch size can be adjusted based on the available system resources. +- Additional logging configurations or output formats can be added for specific use cases. +- The script can be extended to support other types of medical imaging data or metadata. + +Performance & Compatibility: +- The script is designed for performance and efficiency when processing large datasets. +- It uses multiprocessing to parallelize the file processing and optimize resource utilization. 
+ +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" diff --git a/src/validation/reanonymizer.py b/src/validation/reanonymizer.py index 4de41cf..3ff0f69 100644 --- a/src/validation/reanonymizer.py +++ b/src/validation/reanonymizer.py @@ -6,13 +6,43 @@ This script processes DICOM files by comparing anonymized and non-anonymized versions to ensure that the anonymized patient IDs are correct. It corrects any discrepancies in filenames and DICOM metadata based on predefined mappings, and then moves the corrected files to a designated 'checked' directory. This ensures the integrity and consistency of patient data, which is crucial for maintaining accurate and reliable datasets in research projects like the MIMBCD-UI initiative. Key Functions: -- Load and use mappings between real and anonymized patient IDs to validate and correct DICOM files. -- Compare SOPInstanceUID between anonymized and non-anonymized files to identify corresponding pairs. -- Update filenames and DICOM metadata to reflect the correct anonymized patient ID. -- Move processed and validated files to a designated directory for further use. +- Load a CSV file containing patient mapping data. +- Find DICOM files in specified directories. +- Extract metadata (`SOPInstanceUID` and `PatientID`) from DICOM files. +- Update DICOM metadata with corrected `PatientID` values. 
+- Move corrected files to a 'checked' directory for further processing. + +Expected Input: +- Anonymized and non-anonymized DICOM files in separate directories. +- A CSV file containing mappings from Real Patient IDs to Anonymized Patient IDs. + +Output: +- The script moves anonymized files to the `checked` directory if a match is found. +- The matched files are renamed based on the `ViewPosition` and `ImageLaterality` metadata. +- The script logs the progress and results of the comparison. Intended Use Case: -- This script is vital in environments where maintaining the integrity of anonymized medical imaging data is crucial. It supports the correct linkage between anonymized and non-anonymized datasets, ensuring accurate data management in research projects like the MIMBCD-UI initiative. +- This script is useful for validating the anonymization process and ensuring that the correct files are anonymized. +- It can be used as part of a data curation pipeline to verify the integrity of DICOM files. + +Customization & Flexibility: +- The script can be easily extended to support additional metadata fields for comparison. +- It can be adapted to handle other types of medical imaging data or metadata. +- The script can be integrated into automated workflows for data curation and quality control. + +Performance & Compatibility: +- The script is designed for performance and efficiency when processing large datasets. +- It uses multiprocessing to parallelize the comparison of DICOM files and optimize resource utilization. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. 
+- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" @@ -42,7 +72,7 @@ warnings.filterwarnings("ignore", category=NotOpenSSLWarning) # Mapping file name -mapping_fn = "mamo_patients_mapping_data_curated_21052024.csv" +mapping_fn = "mamo_patients_mapping_data.csv" # Define root directory root_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) diff --git a/src/validation/reidentified.py b/src/validation/reidentified.py index c2bc7e0..8447d6e 100644 --- a/src/validation/reidentified.py +++ b/src/validation/reidentified.py @@ -10,14 +10,46 @@ and filename accordingly, and then moves the DICOM file to the "checked" folder. If no match is found, the DICOM file is moved from the "identified" folder to the "unsolvable" folder. -Intended Use Case: -- This script ensures that DICOM files in the "identified" folder are correctly identified based on SOP Instance UID - and have their Patient IDs anonymized according to the mapping file. Files that cannot be matched are moved to the "unsolvable" folder. +Key Functions: +- Load a mapping of Real Patient ID to Anonymized Patient ID from a CSV file. +- Check if a file is a DICOM file by attempting to read it. +- Extract the SOP Instance UID from DICOM metadata. +- Search the "raw" folder and its subfolders for a matching SOP Instance UID. +- Update the Patient ID in the DICOM metadata and filename. +- Move files between folders based on processing results. Expected Input: - DICOM files in the "identified" folder and DICOM files in the "raw" folder. - A CSV file containing mappings from Real Patient IDs to Anonymized Patient IDs. +Output: +- The script updates the Patient ID in the DICOM metadata and filename. +- The script moves DICOM files to the "checked" folder if a match is found. 
+- The script moves DICOM files to the "unsolvable" folder if no match is found. + +Intended Use Case: +- This script is useful for matching DICOM files in the "identified" folder to their "raw" counterparts by SOP Instance UID and anonymizing their Patient IDs. +- It can be used to validate the anonymization process and ensure that the correct files are anonymized. +- The script is part of a data curation pipeline for multimodal breast imaging data. + +Customization & Flexibility: +- The script can be extended to handle additional metadata fields or file formats. +- It can be adapted to process other types of medical imaging data or metadata. +- The script can be integrated into a larger data curation pipeline for multimodal breast imaging data. + +Performance & Compatibility: +- The script is designed for performance and efficiency when processing large datasets. +- It uses multiprocessing to parallelize the file processing and optimize resource utilization. + +Best Practices & Maintenance: +- The script follows best practices for error handling, logging, and code readability. +- It is well-documented and can be easily maintained or extended by other developers. +- The script is designed to be robust and reliable for long-term use in data curation workflows. + +Notes: +- This script is part of a larger data curation pipeline for multimodal breast imaging data. +- It is optimized for processing DICOM files but can be adapted for other types of medical imaging data. +- The script is designed to be run from the command line or as part of an automated workflow. """ __author__ = "Francisco Maria Calisto" diff --git a/tests/test_anonymizer.py b/tests/test_anonymizer.py index d9846f5..0475ab4 100644 --- a/tests/test_anonymizer.py +++ b/tests/test_anonymizer.py @@ -4,6 +4,45 @@ test_anonymizer.py: Unit tests for anonymizing DICOM files using the anonymizer module. These tests ensure that the anonymizer functions, including metadata saving and DICOM anonymization, are working as expected. 
+The test cases cover the anonymization process and the creation of a mapping file. +Unit testing is achieved using the unittest framework. +The tests are run by executing this script directly. + +Key Functions: +- test_anonymize_dicom_file: Test the anonymization process of a DICOM file. +- test_save_anonymized_dicom: Test the saving of anonymized DICOM files. +- test_generate_filename_prefix: Test the generation of a filename prefix. +- test_create_mapping_file: Test the creation of a mapping file. + +Expected Usage: +- Run the test suite to verify the anonymization functionality. +- Check the test results to ensure that the functions work as expected. +- Update the tests as needed to cover additional scenarios or edge cases. + +Customization & Flexibility: +- The test cases can be extended to cover more anonymization scenarios. +- Additional tests can be added to validate specific anonymization or metadata saving scenarios. +- The test suite can be integrated into a continuous integration pipeline. + +Performance & Compatibility: +- The tests are designed to be run in a local development environment. +- The test suite is compatible with Python 3.6+ and the unittest module. +- The tests are optimized for efficiency and reliability. + +Best Practices & Maintenance: +- The test suite follows best practices for unit testing and validation. +- It provides comprehensive coverage of the anonymization functionality. +- The tests are well-documented and can be easily maintained or extended. + +Notes: +- This test suite is part of a larger data curation pipeline for medical imaging data. +- It is designed to validate the functionality of the anonymizer module. +- The tests can be run automatically using a continuous integration service. 
+ +References: +- unittest module: https://docs.python.org/3/library/unittest.html +- pydicom library: https://pydicom.github.io/ +- DICOM standard: https://www.dicomstandard.org/ """ __author__ = "Francisco Maria Calisto" diff --git a/tests/test_encryption.py b/tests/test_encryption.py index 7208715..929f957 100644 --- a/tests/test_encryption.py +++ b/tests/test_encryption.py @@ -10,6 +10,40 @@ and that the secret phrase is correctly read from a temporary file. Unit testing is achieved using the unittest framework. +The tests are run by executing this script directly. + +Key Functions: +- test_encrypt_patient_id: Test the encryption of a patient ID. +- test_read_secret_phrase: Test the reading of a secret phrase from a temporary file. + +Expected Usage: +- Run the test suite to verify the encryption and secret phrase reading functionality. +- Check the test results to ensure that the functions work as expected. +- Update the tests as needed to cover additional scenarios or edge cases. + +Customization & Flexibility: +- The test cases can be extended to cover more encryption scenarios. +- Additional tests can be added to validate specific encryption or secret phrase reading scenarios. +- The test suite can be integrated into a continuous integration pipeline. + +Performance & Compatibility: +- The tests are designed to be run in a local development environment. +- The test suite is compatible with Python 3.6+ and the unittest module. +- The tests are optimized for efficiency and reliability. + +Best Practices & Maintenance: +- The test suite follows best practices for unit testing and validation. +- It provides comprehensive coverage of the encryption and secret phrase reading functionality. +- The tests are well-documented and can be easily maintained or extended. + +Notes: +- This test suite is part of a larger data curation pipeline for medical imaging data. +- It is designed to validate the functionality of the encryption and secret phrase reading modules. 
+- The tests can be run automatically using a continuous integration service. + +References: +- unittest module: https://docs.python.org/3/library/unittest.html +- hashlib module: https://docs.python.org/3/library/hashlib.html """ __author__ = "Francisco Maria Calisto" diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 8153739..8c54dc6 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -4,6 +4,47 @@ test_extractor.py: Unit tests for the DICOM information extractor module. These tests ensure that the extractor correctly handles DICOM files and properly extracts relevant metadata. + +Key Functions: +- test_extract_dicom_info: Test the extraction of metadata from a DICOM file. +- test_extract_patient_id: Test the extraction of the patient ID from DICOM metadata. +- test_extract_modality: Test the extraction of the modality (e.g., MG, US, MR) from DICOM metadata. +- test_extract_image_laterality: Test the extraction of the image laterality from DICOM metadata. +- test_extract_view_position: Test the extraction of the image view position from DICOM metadata. +- test_extract_study_date: Test the extraction of the study date from DICOM metadata. +- test_extract_scanning_sequence: Test the extraction of the scanning sequence from DICOM metadata. +- test_extract_series_description: Test the extraction of the series description from DICOM metadata. +- test_extract_instance_number: Test the extraction of the instance number from DICOM metadata. + +Expected Usage: +- Run the test suite to verify the functionality of the extractor module. +- Check the test results to ensure that the extraction functions work as expected. +- Update the tests as needed to cover additional scenarios or edge cases. + +Customization & Flexibility: +- The test cases can be extended to cover more attributes or metadata fields. +- Additional tests can be added to validate specific extraction scenarios. 
+- The test suite can be integrated into a continuous integration pipeline. + +Performance & Compatibility: +- The tests are designed to be run in a local development environment. +- The test suite is compatible with Python 3.6+ and the unittest module. +- The tests are optimized for efficiency and reliability. + +Best Practices & Maintenance: +- The test suite follows best practices for unit testing and validation. +- It provides comprehensive coverage of the extractor module functionality. +- The tests are well-documented and can be easily maintained or extended. + +Notes: +- This test suite is part of a larger data curation pipeline for medical imaging data. +- It is designed to validate the functionality of the DICOM information extractor module. +- The tests can be run automatically using a continuous integration service. + +References: +- pydicom library: https://pydicom.github.io/ +- DICOM standard: https://www.dicomstandard.org/ +- unittest module: https://docs.python.org/3/library/unittest.html """ __author__ = "Francisco Maria Calisto" diff --git a/tests/test_processor.py b/tests/test_processor.py index 6ee2799..8615134 100644 --- a/tests/test_processor.py +++ b/tests/test_processor.py @@ -5,6 +5,17 @@ These tests ensure that the DICOM batch processing script functions correctly, including anonymization, file handling, and batch processing. + +The tests use mock directories and files to simulate the processing environment +and verify the behavior of the processing functions. + +Unit testing is achieved using the unittest framework. + +Key Functions: +- setUp: Initialize the test environment with mock directories and files. +- test_process_directory_batch: Test the batch processing mechanism in process_directory. +- test_anonymization_flow: Test the full anonymization flow from metadata extraction to file anonymization. +- test_construct_filename_prefix: Test the filename prefix construction logic based on DICOM metadata. 
""" __author__ = "Francisco Maria Calisto" @@ -52,7 +63,7 @@ def setUp(self): # Define mock directories and files for testing self.source_folder = os.path.join(os.sep, 'mock', 'source') # Mock source directory self.output_folder = os.path.join(os.sep, 'mock', 'output') # Mock output directory for anonymized files - self.mapping_file = os.path.join(os.sep, 'mock', 'mapping.csv') # Mock mapping file for patient ID mapping + self.mapping_file = os.path.join(os.sep, 'mock', 'test_mapping.csv') # Mock mapping file for patient ID mapping self.batch_size = 2 # Small batch size for testing purposes # Ensure the mock directories exist by mocking os.makedirs