Skip to content

centralelyon/sound-detection

Repository files navigation

Sound-Detection

The main objective of this project is to develop digital methods for detecting a reference sound in a video or audio file. We want to predict its starting time in the target sample.

Code and details

Results

Results

The feature_extraction method is the most performant. It can be reused in further projects that require sound detection.

Code of the method:

def generate_features(file_path):
    """Build a stacked spectral-feature matrix for one audio file.

    Loads the file with librosa (default sample rate / mono), computes six
    frame-wise descriptors, and stacks them row-wise into a single 2-D
    array of shape (n_feature_rows, n_frames): chroma (12 rows) followed by
    spectral centroid, bandwidth, rolloff, RMS energy, and zero-crossing
    rate (1 row each).

    Parameters
    ----------
    file_path : str
        Path to the audio file to analyse.

    Returns
    -------
    numpy.ndarray
        The concatenated feature matrix.
    """
    signal, sample_rate = librosa.load(file_path)

    # Order matters: downstream comparisons subtract matrices element-wise,
    # so the row layout must stay identical everywhere features are built.
    feature_rows = [
        librosa.feature.chroma_stft(y=signal, sr=sample_rate),
        librosa.feature.spectral_centroid(y=signal, sr=sample_rate),
        librosa.feature.spectral_bandwidth(y=signal, sr=sample_rate),
        librosa.feature.spectral_rolloff(y=signal, sr=sample_rate),
        librosa.feature.rms(y=signal),
        librosa.feature.zero_crossing_rate(signal),
    ]

    return np.concatenate(feature_rows, axis=0)
    
def detect_sound_ref_feature_extraction(samples_path, audio_or_video_name, ref_sound_name, threshold=1000):
    """Locate occurrences of a reference sound inside a longer recording.

    The recording is cut into consecutive, non-overlapping segments the same
    length (in samples) as the reference sound. Each segment's spectral
    features are compared to the reference features by mean absolute
    difference; segments below `threshold` are reported as matches.

    Parameters
    ----------
    samples_path : str
        Directory containing the audio/video file to search.
    audio_or_video_name : str
        File name (joined onto `samples_path`) of the recording to search.
    ref_sound_name : str
        Path to the reference sound file.
    threshold : float, optional
        Maximum mean absolute feature difference for a segment to count as
        a match. Tune per use case.

    Returns
    -------
    list of float
        Start times (in seconds) of segments considered matches.
    """
    target_sound, target_sr = librosa.load(ref_sound_name)
    audio, audio_sr = librosa.load(os.path.join(samples_path, audio_or_video_name))
    # NOTE: librosa.load resamples both files to its default rate, so
    # target_sr == audio_sr and sample counts are directly comparable.

    # BUG FIX: the original computed segment/recording lengths in
    # milliseconds (len * 1000 / sr) but then used those values as SAMPLE
    # indices when slicing `audio`, producing segments of the wrong
    # duration and mismatched feature shapes. Work purely in samples.
    segment_length = len(target_sound)

    target_features = generate_features(ref_sound_name)

    matches = []
    for start in range(0, len(audio) - segment_length, segment_length):
        segment = audio[start : start + segment_length]

        # Same feature layout as generate_features() so the matrices align
        # row-for-row; equal-length signals yield equal frame counts.
        segment_features = np.concatenate(
            (
                librosa.feature.chroma_stft(y=segment, sr=audio_sr),
                librosa.feature.spectral_centroid(y=segment, sr=audio_sr),
                librosa.feature.spectral_bandwidth(y=segment, sr=audio_sr),
                librosa.feature.spectral_rolloff(y=segment, sr=audio_sr),
                librosa.feature.rms(y=segment),
                librosa.feature.zero_crossing_rate(segment),
            ),
            axis=0,
        )

        # Mean absolute difference: lower means more similar.
        similarity = np.mean(np.abs(segment_features - target_features))

        if similarity < threshold:
            matches.append(start / audio_sr)  # sample index -> seconds

    return matches

About

Sound detection in videos using a reference sound

Topics

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published