"""tp3__1_-1.ipynb |
Automatically generated by Colaboratory. |
Original file is located at |
https://colab.research.google.com/drive/1_Sjx5G1BW689ggZJAJ4P7kCZndOobNCp |
""" |
import gradio as gr |
import time |
import tensorflow as tf |
import tensorflow_hub as hub |
import numpy as np |
import matplotlib.pyplot as plt |
import librosa |
from librosa import display as librosadisplay |
import logging |
import math |
import statistics |
import sys |
from IPython.display import Audio, Javascript |
from scipy.io import wavfile |
from base64 import b64decode |
import music21 |
from pydub import AudioSegment |
# Configure the root logger so that only messages at ERROR level or above
# are emitted.
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
# Sample rate (Hz) the recording is resampled to before it is handed to the
# pitch model; 16 kHz mono is the input format documented for SPICE.
EXPECTED_SAMPLE_RATE = 16000


def convert_audio_for_model(user_file, output_file='converted_audio_file.wav'):
    """Convert an arbitrary audio file into the WAV layout the pipeline expects.

    The audio is resampled to ``EXPECTED_SAMPLE_RATE``, down-mixed to a single
    channel and stored with 16-bit samples, then written to ``output_file``.

    Args:
        user_file: Path (or file-like object) of the recording to convert.
        output_file: Destination path of the converted WAV file.

    Returns:
        ``output_file``, so the call can be chained with a reader.
    """
    audio = AudioSegment.from_file(user_file)
    audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
    # The samples are later scaled by MAX_ABS_INT16, which is only correct
    # for 16-bit PCM, so force a 2-byte sample width regardless of the source.
    audio = audio.set_sample_width(2)
    audio.export(output_file, format="wav")
    return output_file
# Divisor used to scale sample values before plotting / inference
# (32768 == 2 ** 15, the magnitude of the most negative 16-bit integer).
MAX_ABS_INT16 = 32768.0
def plot_stft(x, sample_rate, show_black_and_white=False):
    """Draw the log-frequency magnitude spectrogram of ``x`` on a new figure.

    Args:
        x: Audio samples passed straight to ``librosa.stft``.
        sample_rate: Sample rate forwarded to ``specshow`` as ``sr``.
        show_black_and_white: When true, render with the ``gray_r`` colormap
            instead of the default one.

    Returns:
        The newly created 20x10 inch matplotlib figure, including a dB
        colorbar.
    """
    magnitude = np.abs(librosa.stft(x, n_fft=2048))
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 10)
    # Decibels relative to the loudest bin of the spectrogram.
    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    specshow_options = {"data": magnitude_db, "y_axis": 'log', "sr": sample_rate}
    if show_black_and_white:
        specshow_options["cmap"] = 'gray_r'
    librosadisplay.specshow(**specshow_options)

    plt.colorbar(format='%+2.0f dB')
    return fig
# Load the SPICE model from TensorFlow Hub once, at import time, so every
# call to main() reuses the same loaded model object.
model = hub.load("https://tfhub.dev/google/spice/2")
def plot_pitch_conf(pitch_outputs, confidence_outputs):
    """Plot the pitch and confidence series together on a new figure.

    Args:
        pitch_outputs: Sequence of pitch values, one per prediction.
        confidence_outputs: Sequence of confidence values, one per prediction.

    Returns:
        The newly created 20x10 inch matplotlib figure with both curves and a
        legend in the lower-right corner.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 10)
    labelled_series = (
        (pitch_outputs, 'pitch'),
        (confidence_outputs, 'confidence'),
    )
    for series, series_label in labelled_series:
        ax.plot(series, label=series_label)
    ax.legend(loc="lower right")
    return fig
def plot_pitch_conf_notes(confident_pitch_outputs_x, confident_pitch_outputs_y):
    """Scatter the confidently detected pitch values on a new figure.

    Args:
        confident_pitch_outputs_x: Prediction indices of the confident frames.
        confident_pitch_outputs_y: Pitch values of those frames; the y axis is
            fixed to the range [0, 1].

    Returns:
        The newly created 20x10 inch matplotlib figure.
    """
    fig, ax = plt.subplots()
    fig.set_size_inches(20, 10)
    ax.set_ylim([0, 1])
    # A single red layer is enough: drawing the same points a second time in
    # the default colour underneath only added an invisible duplicate artist.
    ax.scatter(confident_pitch_outputs_x, confident_pitch_outputs_y, c="r")
    return fig
def output2hz(pitch_output):
    """Convert a raw SPICE pitch output into a frequency in Hz.

    The pitch output is first mapped linearly onto a constant-Q-transform bin
    (``PT_SLOPE`` / ``PT_OFFSET``) and the bin is then turned into a frequency
    relative to ``FMIN`` using ``BINS_PER_OCTAVE`` bins per octave. The
    constants are the calibration values published with the SPICE model.

    Args:
        pitch_output: Pitch value produced by the model.

    Returns:
        The corresponding frequency in Hz as a float.
    """
    PT_OFFSET = 25.58
    PT_SLOPE = 63.07
    FMIN = 10.0
    # Previously referenced without ever being defined, which made every
    # call fail with NameError.
    BINS_PER_OCTAVE = 12.0
    cqt_bin = pitch_output * PT_SLOPE + PT_OFFSET
    return FMIN * 2.0 ** (1.0 * cqt_bin / BINS_PER_OCTAVE)
def espectro_notas(audio_samples, EXPECTED_SAMPLE_RATE, confident_pitch_outputs_x, confident_pitch_values_hz):
    """Overlay the confident pitch estimates on a grayscale spectrogram.

    Args:
        audio_samples: Audio samples; they are divided by ``MAX_ABS_INT16``
            here before the spectrogram is computed.
        EXPECTED_SAMPLE_RATE: Sample rate of ``audio_samples`` in Hz.
        confident_pitch_outputs_x: Prediction indices of the confident frames.
        confident_pitch_values_hz: Frequencies (Hz) of those frames.

    Returns:
        The figure that holds both the spectrogram and the red pitch markers.
    """
    # plot_stft creates its own figure; reuse it instead of allocating a
    # separate one, otherwise the figure handed back to the caller is blank.
    fig = plot_stft(audio_samples / MAX_ABS_INT16,
                    sample_rate=EXPECTED_SAMPLE_RATE, show_black_and_white=True)
    # The first axes of that figure is the spectrogram (the colorbar axes is
    # appended after it).
    spectrogram_ax = fig.axes[0]
    spectrogram_ax.scatter(confident_pitch_outputs_x, confident_pitch_values_hz, c="r")
    return fig
def hz2offset(freq):
    """Return how far ``freq`` is from the nearest equal-tempered semitone.

    The frequency is expressed as a fractional number of semitones above C0
    (derived from A4 = 440 Hz) and the distance to the closest integer
    semitone is returned.

    Args:
        freq: Frequency in Hz; ``0`` denotes a rest.

    Returns:
        The signed offset in semitones, or ``None`` when ``freq`` is 0.
    """
    if freq == 0:
        # Rests carry no pitch information.
        return None
    # Defined locally: no module-level C0 exists, so relying on a global
    # raised NameError.
    c0 = 440 * pow(2, -4.75)
    semitones = 12 * math.log2(freq / c0)
    return semitones - round(semitones)
def quantize_predictions(group, ideal_offset):
    """Quantize one group of pitch predictions into a single note or a rest.

    Args:
        group: Frequencies in Hz for consecutive predictions; ``0`` marks a
            prediction treated as silence.
        ideal_offset: Tuning offset (in semitones) subtracted from every
            prediction before rounding to a note.

    Returns:
        A ``(error, label)`` tuple. ``label`` is ``"Rest"`` when more than 80%
        of the group is silent, otherwise a note name such as ``"A4"``.
        ``error`` is the quantization cost: ``0.51`` per non-silent prediction
        for a rest, or the summed absolute distance (in semitones) between the
        predictions and the chosen note.
    """
    # Defined locally: neither name exists at module level, so relying on
    # globals raised NameError.
    c0 = 440 * pow(2, -4.75)
    note_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]

    non_zero_values = [v for v in group if v != 0]
    zero_values_count = len(group) - len(non_zero_values)

    if zero_values_count > 0.8 * len(group):
        # Interpret as a rest; each non-silent prediction it swallows costs 0.51.
        return 0.51 * len(non_zero_values), "Rest"

    # Interpret as a note: semitones above C0, corrected by the tuning offset.
    semitones = [
        12 * math.log2(freq / c0) - ideal_offset for freq in non_zero_values
    ]
    h = round(statistics.mean(semitones))
    note = note_names[h % 12] + str(h // 12)
    error = sum(abs(value - h) for value in semitones)
    return error, note
def get_quantization_and_error(pitch_outputs_and_rests, predictions_per_eighth,
                               prediction_start_offset, ideal_offset):
    """Quantize a whole pitch track for one candidate grid and score it.

    The track is left-padded with ``prediction_start_offset`` zeros, cut into
    consecutive groups of ``predictions_per_eighth`` predictions, and every
    group is quantized with ``quantize_predictions``.

    Args:
        pitch_outputs_and_rests: List of frequencies in Hz, ``0`` for rests.
        predictions_per_eighth: Number of predictions per group.
        prediction_start_offset: Number of zero entries prepended to the track.
        ideal_offset: Tuning offset forwarded to ``quantize_predictions``.

    Returns:
        A ``(total_error, labels)`` tuple with the summed quantization error
        and the list of note/rest labels, one per group.
    """
    padded_track = [0] * prediction_start_offset + pitch_outputs_and_rests

    total_error = 0
    labels = []
    for start in range(0, len(padded_track), predictions_per_eighth):
        chunk = padded_track[start:start + predictions_per_eighth]
        chunk_error, label = quantize_predictions(chunk, ideal_offset)
        total_error += chunk_error
        labels.append(label)
    return total_error, labels
# Reference frequencies used to express a pitch as semitones above C0.
_A4_HZ = 440
_C0_HZ = _A4_HZ * pow(2, -4.75)
_NOTE_NAMES = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
# Minimum model confidence for a prediction to be treated as a real pitch.
_CONFIDENCE_THRESHOLD = 0.9


def _semitones_above_c0(freq):
    """Return the fractional number of semitones between C0 and ``freq`` (Hz)."""
    return 12 * math.log2(freq / _C0_HZ)


def _tuning_offset(freq):
    """Return the signed distance of ``freq`` from its nearest semitone."""
    semitones = _semitones_above_c0(freq)
    return semitones - round(semitones)


def _quantize_group(group, ideal_offset):
    """Turn one group of predictions into ``(error, note_or_rest)``."""
    voiced = [freq for freq in group if freq != 0]
    silent_count = len(group) - len(voiced)
    if silent_count > 0.8 * len(group):
        # Mostly silence: call it a rest, paying 0.51 per voiced prediction.
        return 0.51 * len(voiced), "Rest"
    semitones = [_semitones_above_c0(freq) - ideal_offset for freq in voiced]
    h = round(statistics.mean(semitones))
    note = _NOTE_NAMES[h % 12] + str(h // 12)
    return sum(abs(value - h) for value in semitones), note


def _quantize_track(pitches_hz, predictions_per_note, start_offset, ideal_offset):
    """Quantize the whole track on one candidate grid; return ``(error, labels)``."""
    padded = [0] * start_offset + pitches_hz
    total_error = 0
    labels = []
    for start in range(0, len(padded), predictions_per_note):
        error, label = _quantize_group(
            padded[start:start + predictions_per_note], ideal_offset)
        total_error += error
        labels.append(label)
    return total_error, labels


def _search_best_quantization(pitches_hz, ideal_offset):
    """Try every grid size/offset and return ``(predictions_per_note, labels)``
    of the candidate with the lowest quantization error (first one on ties)."""
    best_error = float("inf")
    best_labels = None
    best_predictions_per_note = None
    for predictions_per_note in range(20, 65):
        for start_offset in range(predictions_per_note):
            error, labels = _quantize_track(
                pitches_hz, predictions_per_note, start_offset, ideal_offset)
            if error < best_error:
                best_error = error
                best_labels = labels
                best_predictions_per_note = predictions_per_note
    return best_predictions_per_note, best_labels


def main(audio):
    """Run the full pitch-detection pipeline on one recording.

    The recording is converted with ``convert_audio_for_model``, fed to the
    SPICE model, plotted in several ways and finally quantized into a
    sequence of note names and rests.

    Args:
        audio: Path of the recorded audio file (the Gradio input is configured
            with ``type="filepath"``).

    Returns:
        An 8-tuple ``(converted_audio_file, fig1, fig2, fig3, fig4, fig5, bpm,
        best_notes_and_rests)``: the path of the converted WAV file, the
        waveform figure, the spectrogram, the pitch/confidence curves, the
        confident pitch scatter, the spectrogram with pitch overlay, the
        estimated tempo and the list of note/rest labels. Only the labels and
        the tempo are produced; no score object is rendered.

    Raises:
        ValueError: If no prediction reaches the confidence threshold, since
            neither the plots nor the quantization can be computed then.
    """
    converted_audio_file = convert_audio_for_model(audio)

    # The second positional argument of wavfile.read is ``mmap``, not a file
    # mode, so it is intentionally omitted here.
    sample_rate, raw_samples = wavfile.read(converted_audio_file)
    audio_samples = raw_samples / float(MAX_ABS_INT16)

    model_output = model.signatures["serving_default"](
        tf.constant(audio_samples, tf.float32))
    pitch_outputs = [float(p) for p in model_output["pitch"]]
    confidence_outputs = [float(c) for c in 1.0 - model_output["uncertainty"]]

    # Frequencies in Hz for confident predictions, 0 (rest) for the others.
    pitch_outputs_and_rests = [
        output2hz(p) if c >= _CONFIDENCE_THRESHOLD else 0
        for p, c in zip(pitch_outputs, confidence_outputs)
    ]
    confident_pitch_outputs = [
        (i, p)
        for i, (p, c) in enumerate(zip(pitch_outputs, confidence_outputs))
        if c >= _CONFIDENCE_THRESHOLD
    ]
    if not confident_pitch_outputs:
        raise ValueError(
            "No pitch was detected with enough confidence; "
            "try a clearer or longer recording.")
    confident_pitch_outputs_x, confident_pitch_outputs_y = zip(*confident_pitch_outputs)
    confident_pitch_values_hz = [
        pitch_outputs_and_rests[i] for i in confident_pitch_outputs_x
    ]

    fig1, waveform_ax = plt.subplots()
    waveform_ax.plot(audio_samples)
    # audio_samples is already normalised, so it is plotted as-is;
    # espectro_notas performs its own scaling and therefore gets the raw data.
    fig2 = plot_stft(audio_samples, sample_rate=sample_rate)
    fig3 = plot_pitch_conf(pitch_outputs, confidence_outputs)
    fig4 = plot_pitch_conf_notes(confident_pitch_outputs_x, confident_pitch_outputs_y)
    fig5 = espectro_notas(raw_samples, sample_rate,
                          confident_pitch_outputs_x, confident_pitch_values_hz)

    # Average detuning of the performance relative to the A4 = 440 Hz grid.
    ideal_offset = statistics.mean(
        _tuning_offset(freq) for freq in pitch_outputs_and_rests if freq != 0)

    best_predictions_per_note, best_notes_and_rests = _search_best_quantization(
        pitch_outputs_and_rests, ideal_offset)

    # NOTE(review): the 60 * 60 factor presumably encodes the number of model
    # predictions per second times 60 — confirm against the model's hop size.
    bpm = 60 * 60 / best_predictions_per_note

    return (converted_audio_file, fig1, fig2, fig3, fig4, fig5, bpm,
            best_notes_and_rests)
# Tutorial this demo is based on; appended to the description shown in the UI.
link = "https://www.tensorflow.org/hub/tutorials/spice?hl=es-419&authuser=2"

# Web UI: one microphone input, and one output component per value returned
# by main() (audio, five plots, tempo, note list), in the same order.
iface = gr.Interface(
    fn=main,
    title="Trabajo Práctico N°3 - Detección de tono con SPICE",
    description="Implementación de Modelo con GitHub + Hugging Face🤗-- 🔊✅ " + "Basado en: " + link,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", label="Ingrese Audio"),
    ],
    outputs=[
        gr.outputs.Audio(label="Audio Original"),
        *(
            gr.outputs.Plot(type="auto", label=plot_label)
            for plot_label in (
                "Gráfico de Frecuencias",
                "Especto",
                "Pitch Confidence",
                "Notas",
                "Espectro+Notas",
            )
        ),
        gr.outputs.Textbox(label="bpm"),
        gr.outputs.Textbox(label="partitura"),
    ],
    interpretation="default",
)

# Start the web server with debug output enabled.
iface.launch(debug=True)