Commit 793b2b0c authored by Luc Giffon

first commit: project architecture + data preprocessing script + readme

# numpy data
*.npz
CENTURI Summer School/
data/raw/*
# Pycharm
.idea
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# Setting up the project
## Installation
If you can, use a fresh conda environment for this project. Python 3 is required:

```
conda create -n centuri_1 python=3
source activate centuri_1
```

Go to the `/code` directory, then run:

```
pip install -e .
```

This installs the required dependencies and the project package.
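As an optional check (a minimal sketch, assuming the environment is active and the install succeeded), you can verify that the package is importable:

```
python -c "import centuri_project; print(centuri_project.__file__)"
```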
## Prepare data
Move the directory `1 - Time Series Analysis` provided by the tutor Davide to the `data/raw` directory
and rename it `1_time_series_analysis`.
Go to the `/code/data` directory then execute the file `filter_signal_files.py` then the file `cut_signal_in_sweeps.py`.
python filter_signal_files.py
python cut_signal_in_sweeps.py
To make sure everything has been done properly, go to the `/code/vizualiation` directory then execute the `show_data_sweeps.py` file.
python show_data_sweeps.py
If there is no error, everything is fine.
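As an extra sanity check (a minimal sketch, assuming the scripts above ran without error and wrote `train.npz` under `data/processed`), you can print the shapes of the processed arrays from a Python shell:

```python
from centuri_project.utils import processed_directory
import numpy as np

# load the processed training archive and report the shape of each array
data = np.load(processed_directory / "train.npz", allow_pickle=True)
for key in ("signal_times", "sampling_rates", "signals", "labels"):
    print(key, data[key].shape)
```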
# Start coding
If you want to add something new to the project:

Go to the `/code/centuri_project` directory and look at the file `load_data_example.py` to see how to load the data.
All your new code files must go under the `/code/centuri_project` directory.
from centuri_project.utils import processed_directory
import numpy as np

if __name__ == "__main__":
    data_file = processed_directory / "train.npz"
    data = np.load(data_file, allow_pickle=True)
    data_sweepX = data["signal_times"]  # for each trace, the time of each recorded value (n x d matrix)
    data_sampling_rates = data["sampling_rates"]  # for each trace, the sampling rate in Hz (vector of size n)
    data_signal_values = data["signals"]  # for each trace, the recorded values (n x d matrix): the actual traces to work with
    data_labels = data["labels"]  # for each trace, the event presence (1/0 vector), the event amplitude and the baseline (n x 3 x d cube)
    # here happens the magic
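    # Hypothetical continuation (a sketch, not part of the original file): plot the first
    # processed sweep with its labelled events. The label layout is the one built by
    # filter_signal_files.py: index 0 = event presence (1/0), index 1 = amplitude, index 2 = baseline.
    import matplotlib.pyplot as plt
    times, values, labels = data_sweepX[0], data_signal_values[0], data_labels[0]
    event_idx = np.nonzero(labels[0])[0]
    plt.plot(times, values, label="filtered signal")
    plt.scatter(times[event_idx], labels[2, event_idx] - labels[1, event_idx],
                color="r", label="events (baseline - amplitude)")
    plt.legend()
    plt.show()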
import pathlib

import numpy as np

# Shared project paths used by the data scripts.
project_dir = pathlib.Path(__file__).resolve().parents[2]
raw_data_directory = project_dir / pathlib.Path("data/raw/1_time_series_analysis/EPSC/Traces & Events")
interim_directory = project_dir / pathlib.Path("data/interim")
processed_directory = project_dir / pathlib.Path("data/processed")


def save_data_to_folder(signal_times, sampling_rates, signal, labels, filename, folder_path):
    """Save the signal arrays and their labels in a single .npz archive under folder_path."""
    np.savez(folder_path / filename,
             signal_times=signal_times,
             sampling_rates=sampling_rates,
             signals=signal,
             labels=labels)
import numpy as np

from centuri_project.utils import interim_directory, save_data_to_folder, processed_directory

if __name__ == "__main__":
    filenames = ["train.npz", "test.npz"]
    nb_sweep_by_trace = 9

    for filename in filenames:
        data_file = interim_directory / filename
        data = np.load(data_file, allow_pickle=True)
        # split the column indices into nb_sweep_by_trace contiguous chunks, one chunk per sweep
        lst_idx_sweeps = np.split(np.arange(data["signals"].shape[1]), nb_sweep_by_trace)
        # each vstack below turns an (n, d) matrix of traces into an
        # (n * nb_sweep_by_trace, d / nb_sweep_by_trace) matrix of sweeps
        save_data_to_folder(
            np.vstack([data["signal_times"][:, idxs] for idxs in lst_idx_sweeps]),
            np.repeat(data["sampling_rates"], nb_sweep_by_trace),
            np.vstack([data["signals"][:, idxs] for idxs in lst_idx_sweeps]),
            np.vstack([data["labels"][:, :, idxs] for idxs in lst_idx_sweeps]) if filename == "train.npz" else None,
            filename,
            processed_directory
        )
import pyabf
import matplotlib.pyplot as plt
from scipy.signal import butter, filtfilt
from scipy.fftpack import fft, ifft
import pandas as pd
import numpy as np
from collections import OrderedDict

from centuri_project.utils import save_data_to_folder, interim_directory, raw_data_directory

def running_mean(x, N):
    """
    Compute the running mean of x over a sliding window of N samples.

    :param x: 1-D array of values
    :param N: window size, in number of samples
    :return: array of the window means (length len(x) - N + 1)
    """
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / float(N)

def butter_lowpass(cutoff, sampling_frequency, order=5):
    """
    Create a Butterworth low-pass filter.

    :param cutoff: cutoff frequency in Hz; frequencies greater than this are attenuated
    :param sampling_frequency: the number of samples per second, in Hz
    :param order: order of the filter
    :return: the filter coefficients (b, a)
    """
    nyq = 0.5 * sampling_frequency  # Nyquist frequency
    normal_cutoff = cutoff / nyq  # cutoff expressed as a fraction of the Nyquist frequency
    b, a = butter(order, normal_cutoff, btype='low', analog=False, output="ba")
    return b, a

def butter_lowpass_filter(data, cutoff, fs, order=5):
    """
    Apply a Butterworth low-pass filter to data.

    :param data: the data (amplitude: Y axis) to filter
    :param cutoff: cutoff frequency in Hz
    :param fs: sampling frequency in Hz
    :param order: order of the filter
    :return: the filtered data
    """
    b, a = butter_lowpass(cutoff, fs, order=order)
    y = filtfilt(b, a, data)
    return y

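
# Hypothetical usage sketch (not in the original file): low-pass filtering a noisy 10 kHz test
# signal; the 3 kHz component, well above the 1000 Hz cutoff, should be strongly attenuated.
def _example_butter_lowpass_filter():
    fs = 10000  # Hz, matches the recordings' sampling rate
    t = np.arange(0, 1, 1 / fs)
    clean = np.sin(2 * np.pi * 5 * t)  # 5 Hz component, kept by the filter
    noisy = clean + 0.5 * np.sin(2 * np.pi * 3000 * t)  # 3 kHz component, removed by the filter
    return butter_lowpass_filter(noisy, cutoff=1000, fs=fs, order=6)
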
def band_stop_fft_filter(data, freq_range_stop):
    """
    Band-stop filter: zero out the FFT coefficients whose indices fall in freq_range_stop,
    then transform back to the time domain.

    :param data: the data to filter
    :param freq_range_stop: (start, stop) range of FFT bins to zero out
    :return: the filtered data (complex array returned by the inverse FFT)
    """
    fft_data = fft(data)
    bandstopped_fft_data = fft_data.copy()
    for i in range(*freq_range_stop):
        bandstopped_fft_data[i] = 0
    filtered_data = ifft(bandstopped_fft_data)
    return filtered_data

def process_abf_data(abf_data, ax=None):
    """
    Apply filtering to abf data:

    * Butterworth low-pass filtering attenuates frequencies above 1000 Hz;
    * FFT band-stop filtering removes the 50 Hz (AC) component.

    :param abf_data: the pyabf.ABF object holding the signal
    :param ax: optional matplotlib axis on which to plot the raw and filtered signals
    :return: the filtered signal values
    """
    cutoff_freq_1000 = 1000  # Hz, frequencies above this are cut off
    order = 6  # order of the Butterworth filter
    sampling_rate = abf_data.dataRate  # Hz

    # filter out high frequencies (> 1000 Hz)
    filtered_abf_data_Y = butter_lowpass_filter(abf_data.sweepY, cutoff_freq_1000, sampling_rate, order)
    # filter out the AC (mains) frequency around 50 Hz
    ac_filtered_abf_data_Y = band_stop_fft_filter(filtered_abf_data_Y, (50, 51))
    # TODO: remove frames of values around big spikes

    if ax is not None:
        ax.plot(abf_data.sweepX, abf_data.sweepY, color="c", zorder=-2, label="raw data")
        ax.plot(abf_data.sweepX, filtered_abf_data_Y, color="b", zorder=-1, label="high frequencies filtered data")
        ax.plot(abf_data.sweepX, ac_filtered_abf_data_Y, color="lime", zorder=0, label="AC filtered data")

    return ac_filtered_abf_data_Y

def process_asc_file(file, signal_size, signal_sampling_rate, ax=None):
    """
    Read an asc file into a pandas dataframe and extract the fields of interest:

    * column 1: time of events
    * column 2: amplitudes of events
    * column 6: baselines of events

    :param file: path to the asc file
    :param signal_size: number of samples in the corresponding signal
    :param signal_sampling_rate: sampling rate of the corresponding signal, in Hz
    :param ax: optional matplotlib axis on which to plot the events
    :return: a 3 x signal_size cube: event presence (1/0), event amplitude, event baseline
    """
    asc_data = pd.read_csv(file, delimiter="\t", header=None)
    # parse the comma-formatted event times and rescale them by 1e-3
    asc_data[1] = asc_data[1].apply(lambda str_comma_float: float(str_comma_float.replace(",", "")) * 1e-3)

    # get the interesting fields as numpy arrays (all of the same dimension)
    all_events_time = asc_data[1].values
    all_amplitudes_at_events = asc_data[2].values
    all_baselines_at_events = asc_data[6].values

    # create sparse vectors for each interesting field
    events_indexes = (all_events_time * signal_sampling_rate).astype(int)
    one_hot_event = np.zeros(signal_size)
    one_hot_event[events_indexes] = 1
    amplitudes_events = np.zeros(signal_size)
    amplitudes_events[events_indexes] = all_amplitudes_at_events
    baseline_events = np.zeros(signal_size)
    baseline_events[events_indexes] = all_baselines_at_events

    # stack those sparse vectors into a cube
    cube_events = np.stack([one_hot_event, amplitudes_events, baseline_events], axis=0)

    if ax is not None:
        ax.scatter(all_events_time, all_baselines_at_events - all_amplitudes_at_events, edgecolors="r", facecolors="none", s=80, zorder=1, label="event")
        ax.scatter(all_events_time, all_baselines_at_events, color='y', s=80, marker='x', zorder=2, label="baseline signal")

    return cube_events

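
# Hypothetical sketch (not in the original file) of the sparse label cube built above: two
# events at samples 2 and 5 of an 8-sample signal become three aligned sparse vectors
# stacked into a (3, signal_size) cube.
def _example_label_cube():
    signal_size = 8
    events_indexes = np.array([2, 5])
    amplitudes = np.array([12.0, 7.5])
    baselines = np.array([-30.0, -28.0])
    one_hot = np.zeros(signal_size)
    one_hot[events_indexes] = 1
    amps = np.zeros(signal_size)
    amps[events_indexes] = amplitudes
    bases = np.zeros(signal_size)
    bases[events_indexes] = baselines
    return np.stack([one_hot, amps, bases], axis=0)  # shape (3, 8)
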
if __name__ == "__main__":
    abf_train_files = [file for file in raw_data_directory.glob("**/*") if file.is_file()]

    lst_test = {}
    lst_test["signals"] = []
    lst_test["signals_times"] = []
    lst_test["sampling_rates"] = []

    lst_train = {}
    lst_train["signals"] = []
    lst_train["signals_times"] = []
    lst_train["cube_labels"] = []
    lst_train["sampling_rates"] = []

    for file in abf_train_files:
        if file.suffix.upper() == ".ABF":
            abf_file = file
            abf_data = pyabf.ABF(abf_file)
            sampling_rate = abf_data.dataRate
            if sampling_rate != 10000:  # sweeps with a sampling rate other than 10000 Hz do not have the same length; ignore them for now
                continue
            # filter the data (AC noise frequency (50 Hz) and frequencies > 1000 Hz)
            filtered_abf_data_Y = process_abf_data(abf_data)
            lst_test["signals"].append(filtered_abf_data_Y)
            lst_test["signals_times"].append(abf_data.sweepX)
            lst_test["sampling_rates"].append(sampling_rate)
        elif file.suffix.upper() == ".ASC":
            f, ax = plt.subplots()

            # get the data in the matching abf file
            abf_file = file.with_suffix('.ABF')
            abf_data = pyabf.ABF(abf_file)
            sampling_rate = abf_data.dataRate
            # filter the data (AC noise frequency (50 Hz) and frequencies > 1000 Hz)
            filtered_abf_data_Y = process_abf_data(abf_data, ax)

            # get the 3 x d cube of labels: event presence as a 1/0 vector, amplitude of the event, baseline of the event
            asc_file = file
            cube_labels = process_asc_file(asc_file, len(filtered_abf_data_Y), sampling_rate, ax)

            lst_train["signals"].append(filtered_abf_data_Y)
            lst_train["signals_times"].append(abf_data.sweepX)
            lst_train["cube_labels"].append(cube_labels)
            lst_train["sampling_rates"].append(sampling_rate)

            # visualization settings
            time_win_mean = 0.5
            offset_y = 50
            running_mean_win = int(time_win_mean * sampling_rate)
            running_mean_abf_dataY = running_mean(filtered_abf_data_Y, running_mean_win)
            maxy = max(running_mean_abf_dataY) + offset_y
            miny = min(running_mean_abf_dataY) - offset_y
            limits = (maxy, miny)
            ax.set_ylim(*limits)

            f.suptitle(file.name)
            handles, labels = plt.gca().get_legend_handles_labels()
            by_label = OrderedDict(zip(labels, handles))
            plt.legend(by_label.values(), by_label.keys())
            plt.show()
        else:
            raise ValueError("Unknown file type: {}".format(file))

    save_data_to_folder(np.stack(lst_train["signals_times"], axis=0),
                        np.array(lst_train["sampling_rates"]),
                        np.stack(lst_train["signals"], axis=0),
                        np.stack(lst_train["cube_labels"], axis=0),
                        "train", interim_directory)

    save_data_to_folder(np.stack(lst_test["signals_times"], axis=0),
                        np.array(lst_test["sampling_rates"]),
                        np.stack(lst_test["signals"], axis=0),
                        None,
                        "test", interim_directory)
#!/usr/bin/env python
from setuptools import setup, find_packages
import sys

NAME = 'centuri_project'
DESCRIPTION = 'spike detection'
LICENSE = 'GNU General Public License v3 (GPLv3)'
INSTALL_REQUIRES = ['numpy', 'daiquiri', 'matplotlib', 'pandas', 'keras', 'docopt', 'scipy']  # TODO to be completed
PYTHON_REQUIRES = '>=3.5'

###############################################################################

if sys.argv[-1] == 'setup.py':
    print("To install, run 'python setup.py install'\n")

if sys.version_info[:2] < (3, 5):
    errmsg = '{0} requires Python 3.5 or later ({1[0]:d}.{1[1]:d} detected).'
    print(errmsg.format(NAME, sys.version_info[:2]))
    sys.exit(-1)


def setup_package():
    """Setup function"""
    setup(name=NAME,
          version='0.1',
          description=DESCRIPTION,
          license=LICENSE,
          packages=find_packages(),
          install_requires=INSTALL_REQUIRES,
          python_requires=PYTHON_REQUIRES,
          )


if __name__ == "__main__":
    setup_package()