input
Output (to actually use the result, combine the following two files with ffmpeg or video-editing software)
cut_movie.py
import datetime
import os
import cv2
import librosa
import numpy as np
import scipy
def pretty_print_sec(sec):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS`` string.

    Only used to make console output and the on-frame timestamp readable.

    Args:
        sec: Duration in seconds (int or float). Fractions are truncated
            with ``int()``.

    Returns:
        A string such as ``"01:02:03"``. The hour field is padded to at
        least two digits and grows as needed, so durations of 100 hours or
        more are not truncated.
    """
    total_sec = int(sec)
    hours, remainder = divmod(total_sec, 3600)
    minutes, seconds = divmod(remainder, 60)
    return "{:02d}:{:02d}:{:02d}".format(hours, minutes, seconds)
def is_in(tuple_list, val):
    """Tell whether ``val`` lies inside any of the given clipping ranges.

    Args:
        tuple_list: Iterable of ranges; element 0 of each is the lower
            bound and element 1 the upper bound.
        val: The value (a time in seconds, as used by the callers) to test.

    Returns:
        True if ``lower <= val <= upper`` holds for at least one range
        (both ends inclusive), otherwise False.
    """
    return any(span[0] <= val <= span[1] for span in tuple_list)
def cut_by_max_rms(rms, percentile, frame_sec=30.0):
    """Select the loudest analysis windows using a percentile threshold.

    Every window whose RMS is greater than or equal to the given percentile
    of all windows is selected; runs of consecutive selected windows are
    merged into a single range.

    Args:
        rms: 2-D array-like whose first row holds one RMS value per analysis
            window (only ``rms[0]`` is read).
        percentile: Percentile (0-100) passed to ``np.percentile`` to derive
            the loudness threshold.
        frame_sec: Duration of one analysis window in seconds. Defaults to
            30, the value that was previously hard-coded.

    Returns:
        A list of ``(start_sec, end_sec)`` float tuples. Window ``i`` maps
        to ``(i * frame_sec, (i + 1) * frame_sec)``. An empty list is
        returned when there are no windows.
    """
    levels = np.asarray(rms[0])
    if levels.size == 0:
        # np.percentile cannot produce a threshold from nothing.
        return []

    threshold = np.percentile(levels, percentile)
    loud = levels >= threshold

    # Pad with False on both sides so that every run of loud windows has
    # both a rising edge (+1) and a falling edge (-1), including runs that
    # touch the first or last window.
    padded = np.concatenate(([False], loud, [False])).astype(np.int8)
    edges = np.diff(padded)
    run_starts = np.flatnonzero(edges == 1)
    run_ends = np.flatnonzero(edges == -1)  # index of first window after the run

    return [
        (float(first) * frame_sec, float(after) * frame_sec)
        for first, after in zip(run_starts, run_ends)
    ]
def cut_by_local_max_rms(rms, max_frame_num, frame_sec=30.0):
    """Select analysis windows that are local loudness peaks.

    Relative maxima of the RMS curve are searched with an increasing
    neighbourhood size (``order``) until no more than ``max_frame_num``
    peaks remain.

    Args:
        rms: 2-D array-like whose first row holds one RMS value per analysis
            window (only ``rms[0]`` is read).
        max_frame_num: Maximum number of windows to select. Values below 1
            select nothing.
        frame_sec: Duration of one analysis window in seconds. Defaults to
            30, the value that was previously hard-coded.

    Returns:
        A list of ``(start_sec, end_sec)`` float tuples, one per selected
        peak, where peak window ``i`` maps to
        ``(i * frame_sec, (i + 1) * frame_sec)``.
    """
    # Imported here because a bare ``import scipy`` is not guaranteed to
    # expose the ``scipy.signal`` submodule.
    from scipy.signal import argrelmax

    levels = np.asarray(rms[0])
    if max_frame_num < 1 or levels.size == 0:
        return []

    order = 1
    peaks = argrelmax(levels, order=order)[0]
    # Widen the comparison neighbourhood until few enough peaks survive.
    # The ``order < levels.size`` bound guarantees termination: once the
    # neighbourhood spans the whole curve the result can no longer change.
    while len(peaks) > max_frame_num and order < levels.size:
        order += 1
        peaks = argrelmax(levels, order=order)[0]

    return [
        (float(peak) * frame_sec, float(peak + 1) * frame_sec)
        for peak in peaks
    ]
def decide_cut_frames(cut_type, voice_file):
    """Decide which parts of the recording to keep, based on voice volume.

    The voice track is loaded at a low sample rate, its RMS level is
    computed once per 30-second window, and the windows to keep are chosen
    by the strategy named in ``cut_type``.

    Args:
        cut_type: ``"local_max"`` keeps up to 20 windows where the volume
            forms a local peak; ``"max"`` keeps the loudest 5% of windows.
        voice_file: Path of the audio file containing the extracted voice.

    Returns:
        The list of ``(start_sec, end_sec)`` tuples produced by the
        selected cutting function.

    Raises:
        ValueError: If ``cut_type`` is not one of the supported names.
    """
    # Validate before doing the (slow) audio load so that a typo fails fast
    # with a clear message instead of an UnboundLocalError afterwards.
    if cut_type not in ("local_max", "max"):
        raise ValueError(
            "Unknown cut_type {!r}; expected 'local_max' or 'max'".format(cut_type)
        )

    # A sample rate of 8000 keeps the analysis light; only the overall
    # loudness matters here.
    y_voice, sr_voice = librosa.load(voice_file, sr=8000, mono=True)

    # One RMS value per 30 seconds of audio.
    # NOTE(review): with center=True librosa presumably centres window i on
    # i * 30 s rather than starting it there, while the cutting functions
    # treat window i as [i * 30, (i + 1) * 30] - confirm the intended
    # alignment.
    samples_per_window = sr_voice * 30
    rms = librosa.feature.rms(
        y=y_voice,
        frame_length=samples_per_window,
        hop_length=samples_per_window,
        center=True,
        pad_mode="reflect",
    )

    if cut_type == "local_max":
        # Keep up to 20 windows where the volume peaks.
        return cut_by_local_max_rms(rms, 20)
    # "max": keep the loudest 5% of windows, i.e. those at or above the
    # 95th percentile.
    return cut_by_max_rms(rms, 95)
def cut_movie(cut_list, movie_file, output_movie_file):
    """Write the selected time ranges of a video to a new file.

    Each written frame is stamped with its time in the source video. The
    output has no audio track.

    Args:
        cut_list: Iterable of ``(start_sec, end_sec)`` ranges to keep.
        movie_file: Path of the source video.
        output_movie_file: Path of the video to create. An existing file at
            this path is deleted first.

    Raises:
        IOError: If the source video cannot be opened or reports a
            non-positive frame rate.
    """
    movie = cv2.VideoCapture(movie_file)
    out = None
    try:
        if not movie.isOpened():
            raise IOError("Cannot open video file: {}".format(movie_file))
        fps = movie.get(cv2.CAP_PROP_FPS)
        if fps <= 0:
            # Timestamps below are derived by dividing by fps.
            raise IOError("Invalid frame rate {} for {}".format(fps, movie_file))
        height = int(movie.get(cv2.CAP_PROP_FRAME_HEIGHT))
        width = int(movie.get(cv2.CAP_PROP_FRAME_WIDTH))
        print(fps, width, height)

        # Output codec; per the original author's note, the right choice
        # may differ depending on the OS.
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        # Remove any stale output first (the original author notes that an
        # existing file causes an error).
        if os.path.exists(output_movie_file):
            os.remove(output_movie_file)
        # Pass the source frame rate unmodified: truncating e.g. 29.97 to 29
        # would stretch the video relative to the separately cut audio.
        out = cv2.VideoWriter(output_movie_file, fourcc, fps, (width, height))

        font = cv2.FONT_HERSHEY_SIMPLEX
        for start, end in cut_list:
            frame_pos = start * fps
            movie.set(cv2.CAP_PROP_POS_MSEC, start * 1000)
            # Read frame by frame from ``start`` and stop once ``end`` is
            # exceeded.
            while movie.isOpened():
                sec = float(frame_pos / fps)
                if sec % 60 == 0:
                    # Progress report at whole minutes.
                    print(pretty_print_sec(sec), datetime.datetime.now(), flush=True)
                ret, frame = movie.read()
                if not ret:
                    break
                # Stamp the current source time near the bottom-left corner.
                cv2.putText(
                    frame,
                    pretty_print_sec(sec),
                    (10, int(height * 0.9)),
                    font,
                    1,
                    (0, 255, 0),
                    2,
                    cv2.LINE_AA,
                )
                if is_in(cut_list, sec):
                    out.write(frame)
                frame_pos += 1
                if sec > end:
                    break
    finally:
        # Always free the capture and writer, even if an error occurred.
        movie.release()
        if out is not None:
            out.release()
def cut_audio(cut_list, voice_file, output_audio_file):
    """Write the selected time ranges of an audio file to a WAV file.

    Args:
        cut_list: Iterable of ``(start_sec, end_sec)`` ranges to keep. Both
            ends are inclusive; ranges reaching past the end of the audio
            are clamped to the available samples.
        voice_file: Path of the source audio.
        output_audio_file: Path of the WAV file to create.
    """
    # scipy is already a dependency of this file; ``librosa.output.write_wav``
    # was removed in librosa 0.8, so the WAV is written with scipy directly.
    from scipy.io import wavfile

    # sr=None keeps the file's native sampling rate (omitting ``sr`` would
    # resample to librosa's default of 22050 Hz).
    y_full, sr_full = librosa.load(voice_file, sr=None, mono=False)
    # A mono file is returned as a 1-D array; normalise to (channels, samples).
    y_full = np.atleast_2d(y_full)
    sr_full = int(sr_full)
    total_samples = y_full.shape[1]

    segments = []
    for start, end in cut_list:
        first = max(int(start * sr_full), 0)
        # +1 keeps the sample at ``end`` itself, matching the inclusive range.
        stop = min(int(end * sr_full) + 1, total_samples)
        if first >= stop:
            continue
        print(pretty_print_sec(start), datetime.datetime.now(), flush=True)
        segments.append(y_full[:, first:stop])

    if segments:
        output = np.concatenate(segments, axis=1)
    else:
        # Nothing selected: write a valid, empty file with the same channels.
        output = y_full[:, :0]

    # scipy expects (samples, channels).
    wavfile.write(output_audio_file, sr_full, np.ascontiguousarray(output.T))
def main():
    """Run the whole pipeline: choose the ranges, then cut video and audio.

    The voice-only track drives the selection; the cut is applied to the
    full video and to the full audio track of that video.
    """
    # "local_max" selects by local volume peaks; use "max" instead to select
    # by the overall loudest windows.
    selection_mode = "local_max"

    # "voice.wav" holds only the voice extracted from the video.
    segments = decide_cut_frames(selection_mode, "voice.wav")
    cut_movie(segments, "full.mp4", "cut.mp4")
    # "full.mp3" is the complete audio extracted from the video.
    cut_audio(segments, "full.mp3", "cut.wav")


if __name__ == "__main__":
    main()