Thing you want to do

When recording a live stream, it is hard to find the highlights in footage that runs for several hours, so I want to automate that search.

approach

Based on the hypothesis that a highlight = an exciting moment = a loud part, the loud parts are extracted.
Looking at the actual data, the overall voice volume tended to decrease in the second half (fatigue?), so I decided to use local maxima (peaks) of the volume instead of its absolute level.
The code below lets you switch between the two approaches.

Relation

A similar approach that focuses on volume to extract the exciting moments is also described in the article below.
- https://qiita.com/120byte/items/79a018f73db0544f5631
The moviepy library used in that article did not work in my environment, so this time I built it with librosa and OpenCV.

Source

input

voice.wav: Voice-only data (not including background sound)
full.mp3: Audio + background sound
full.mp4: Video (with or without audio)

Output (When actually using it, please combine the following two files with ffmpeg or video editing software)

cut.wav: Audio data
cut.mp4: Video without audio

`cut_movie.py`


import datetime
import os

import cv2
import librosa
import numpy as np
import scipy


def pretty_print_sec(sec):
    """Format a number of seconds as a zero-padded ``HH:MM:SS`` string.

    Only used for readable console output and the on-frame timestamp.

    Args:
        sec: Elapsed time in seconds; any fractional part is truncated.

    Returns:
        A string such as ``"01:02:03"``. The hour field grows beyond two
        digits when needed instead of being silently truncated.
    """
    total_seconds = int(sec)

    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)

    return "{:02d}:{:02d}:{:02d}".format(hours, minutes, seconds)


def is_in(tuple_list, val):
    """Return True when ``val`` lies inside any of the clipping ranges.

    Args:
        tuple_list: Iterable of ``(start, end)`` pairs; both ends are inclusive.
        val: The position (in seconds) to test.

    Returns:
        True if at least one range contains ``val``, otherwise False.
    """
    return any(span[0] <= val <= span[1] for span in tuple_list)


def cut_by_max_rms(rms, percentile):
    """Select clipping ranges from the frames whose volume reaches a percentile.

    Each frame is treated as 30 seconds long. Consecutive selected frames are
    merged into a single range.

    Args:
        rms: Indexable whose first element is the per-frame volume sequence.
        percentile: Percentile (0-100) of the volume used as the threshold;
            frames at or above it are selected.

    Returns:
        A list of ``(start_sec, end_sec)`` float tuples in ascending order.
    """
    levels = rms[0]
    threshold = np.percentile(levels, percentile)

    segments = []
    run_start = None  # start second of the loud run currently open, if any

    for index, level in enumerate(levels):
        if level >= threshold:
            # Open a new run only when the previous frame was not selected.
            if run_start is None:
                run_start = float(index) * 30
        elif level < threshold and run_start is not None:
            # First quiet frame after a loud run closes the range.
            segments.append((run_start, float(index) * 30))
            run_start = None

    # A run that is still open extends to the end of the last frame.
    if run_start is not None:
        segments.append((run_start, float(len(levels)) * 30))

    return segments


def cut_by_local_max_rms(rms, max_frame_num):
    """Select clipping ranges around local maxima (peaks) of the volume.

    The neighbourhood size (``order``) used to detect peaks is widened step by
    step until at most ``max_frame_num`` peaks remain. Each frame is treated as
    30 seconds long, so peak frame ``p`` becomes the range ``(30 * p, 30 * (p + 1))``.

    Args:
        rms: Indexable whose first element is the per-frame volume sequence.
        max_frame_num: Maximum number of peak frames to return.

    Returns:
        A list of ``(start_sec, end_sec)`` float tuples, ordered by time and
        containing no more than ``max_frame_num`` entries.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.signal`` is loaded on every SciPy version.
    from scipy.signal import argrelmax

    levels = np.asarray(rms[0])
    if max_frame_num <= 0 or levels.size == 0:
        return []

    order = 1
    peaks = argrelmax(levels, order=order)[0]

    # Once ``order`` spans the whole sequence, widening it further cannot
    # remove any more peaks, so stop there instead of looping forever (a
    # unique interior global maximum survives every order).
    while len(peaks) > max_frame_num and order < levels.size:
        order += 1
        peaks = argrelmax(levels, order=order)[0]

    peaks = peaks[:max_frame_num]

    # Plain floats keep the result consistent with cut_by_max_rms.
    return [(float(point) * 30, float(point + 1) * 30) for point in peaks]


def decide_cut_frames(cut_type, voice_file):
    """Decide which parts of the recording to cut out, based on voice volume.

    Args:
        cut_type: ``"local_max"`` to pick volume peaks (up to 20 frames), or
            ``"max"`` to pick the loudest 5% of frames.
        voice_file: Path to the voice-only audio file to analyse.

    Returns:
        A list of ``(start_sec, end_sec)`` tuples to clip.

    Raises:
        ValueError: If ``cut_type`` is not one of the supported values.
    """
    # Load the audio used to locate the clips. A low sample rate (8000 Hz)
    # keeps the analysis as light as possible.
    y_voice, sr_voice = librosa.load(voice_file, sr=8000, mono=True)

    # Measure the volume once per 30-second frame.
    # NOTE(review): with center=True librosa centres frame t on sample
    # t * hop_length, so frame i covers roughly [30*i - 15, 30*i + 15] s while
    # the cut helpers map frame i to [30*i, 30*(i + 1)] s - confirm whether
    # center=False was intended.
    rms = librosa.feature.rms(
        y=y_voice,
        frame_length=sr_voice * 30,
        hop_length=sr_voice * 30,
        center=True,
        pad_mode="reflect",
    )

    if cut_type == "local_max":
        # Pick frames where the volume peaks, up to 20 frames.
        return cut_by_local_max_rms(rms, 20)
    if cut_type == "max":
        # Pick the loudest 5% of frames, i.e. those at or above the
        # 95th percentile of the volume.
        return cut_by_max_rms(rms, 95)

    raise ValueError(
        "unknown cut_type {!r}; expected 'local_max' or 'max'".format(cut_type)
    )


def cut_movie(cut_list, movie_file, output_movie_file):
    """Write the selected ranges of a video to a new, audio-less video file.

    Every written frame is stamped with its position in the source video.

    Args:
        cut_list: List of ``(start_sec, end_sec)`` ranges to keep (inclusive).
        movie_file: Path to the source video.
        output_movie_file: Path of the video to create; an existing file is
            replaced.

    Raises:
        IOError: If the source video cannot be opened or reports no frame rate.
    """
    movie = cv2.VideoCapture(movie_file)
    out = None

    try:
        fps = movie.get(cv2.CAP_PROP_FPS)
        if not movie.isOpened() or fps <= 0:
            raise IOError("cannot read video file: {}".format(movie_file))

        height = movie.get(cv2.CAP_PROP_FRAME_HEIGHT)
        width = movie.get(cv2.CAP_PROP_FRAME_WIDTH)
        print(fps, int(width), int(height))

        # Output codec. Note that the usable value may depend on the OS.
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")

        # An existing output file causes an error, so delete it first.
        if os.path.exists(output_movie_file):
            os.remove(output_movie_file)

        # Keep the exact source frame rate: truncating e.g. 29.97 to 29 would
        # stretch the video and desynchronise it from the separately cut audio.
        out = cv2.VideoWriter(
            output_movie_file, fourcc, fps, (int(width), int(height))
        )

        font = cv2.FONT_HERSHEY_SIMPLEX
        text_origin = (10, int(height * 0.9))

        for start, end in cut_list:
            # Seek to the start of the range, then read frame by frame.
            movie.set(cv2.CAP_PROP_POS_MSEC, start * 1000)
            frame_index = start * fps
            last_minute = None

            while movie.isOpened():
                sec = float(frame_index / fps)
                if sec > end:
                    break

                # Report progress at the start of each range and whenever a
                # new minute of source time is reached.
                minute = int(sec // 60)
                if minute != last_minute:
                    print(pretty_print_sec(sec), datetime.datetime.now(), flush=True)
                    last_minute = minute

                ret, frame = movie.read()
                if not ret:
                    break

                # Stamp the current source time onto the frame.
                cv2.putText(
                    frame,
                    pretty_print_sec(sec),
                    text_origin,
                    font,
                    1,
                    (0, 255, 0),
                    2,
                    cv2.LINE_AA,
                )

                out.write(frame)
                frame_index += 1
    finally:
        # Release the capture and the writer even when an error occurs.
        movie.release()
        if out is not None:
            out.release()


def cut_audio(cut_list, voice_file, output_audio_file):
    """Write the selected ranges of an audio file to a new WAV file.

    Args:
        cut_list: List of ``(start_sec, end_sec)`` ranges to keep; the sample
            at ``end_sec`` is included.
        voice_file: Path to the source audio.
        output_audio_file: Path of the WAV file to create.
    """
    # librosa.output.write_wav was removed in librosa 0.8; it was a thin
    # wrapper around scipy.io.wavfile.write, which is used directly here.
    from scipy.io import wavfile

    # sr=None keeps the file's native sample rate (librosa would otherwise
    # resample to 22050 Hz).
    y_full, sr_full = librosa.load(voice_file, sr=None, mono=False)

    # Mono files load as a 1-D array; normalise to (channels, samples).
    channels = np.atleast_2d(y_full)
    total_samples = channels.shape[1]

    pieces = []
    for start, end in cut_list:
        first = max(int(start * sr_full), 0)
        # Clamp to the audio length: the last 30-second window usually runs
        # past the end of the recording.
        last = min(int(end * sr_full) + 1, total_samples)
        if first >= last:
            continue

        print(pretty_print_sec(start), datetime.datetime.now(), flush=True)
        pieces.append(channels[:, first:last])

    if pieces:
        merged = np.concatenate(pieces, axis=1)
    else:
        merged = channels[:, :0]

    # scipy expects the data as (samples, channels).
    wavfile.write(output_audio_file, int(sr_full), np.ascontiguousarray(merged.T))


def main():
    """Pick the clip ranges from the voice track, then cut the video and audio.

    Inputs: ``voice.wav`` (voice only), ``full.mp3`` (voice plus background
    sound) and ``full.mp4``. Outputs: ``cut.mp4`` (no audio) and ``cut.wav``.
    """
    # "local_max" selects volume peaks; use "max" to select by the loudest
    # volume level instead.
    selection_mode = "local_max"

    voice_track = "voice.wav"
    segments = decide_cut_frames(selection_mode, voice_track)

    cut_movie(segments, "full.mp4", "cut.mp4")
    cut_audio(segments, "full.mp3", "cut.wav")


if __name__ == "__main__":
    main()

Remarks

The code assumes that the voice and the background sound (such as game audio) are available as separate tracks.
Therefore, extracting highlights (so-called clipping) from a video that has already been published is out of scope for this code, because the voice and the background sound cannot be separated.
However, exciting scenes tend to be loud overall, so the approach may still be effective to some extent.

[PYTHON] I want to automatically find the highlights in the videos I recorded

Thing you want to do

approach

Relation

Source

cut_movie.py

Remarks

`cut_movie.py`