main
Michael Murtaugh 3 months ago
parent 738d07dc3e
commit e4fcd42f23

.gitignore vendored

@@ -1,4 +1,3 @@
*.mp3
*~
# python stuff
__pycache__/

@@ -0,0 +1,2 @@
*.mp3
*.wav

@@ -1,6 +1,9 @@
#!/bin/bash
#
# vosk can also output JSON that includes the timing of each individual
# detected WORD!
# NOTE: I had an error when I did this and needed to PATCH some PYTHON code in VOSK
# see VOSKPATCH.TXT
vosk-transcriber -l en-us -i worm25_mia_60.wav -t json -o worm25_mia_60.json
vosk-transcriber -l en-us -i w25mia60.wav -t json -o w25mia60.json
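As a quick sanity check on the per-word output, a minimal sketch (assuming the monologues/terms schema that the vtt converter further down consumes):

# peek at the first few word timings in the vosk JSON output
import json

with open("w25mia60.json") as f:
    data = json.load(f)

words = [t for m in data["monologues"] for t in m["terms"] if t["type"] == "WORD"]
for w in words[:5]:
    print(w["start"], w["end"], w["text"], w["confidence"])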

timecode.py

@@ -0,0 +1,96 @@
# This file is part of Active Archives.
# Copyright 2006-2016 the Active Archives contributors (see AUTHORS)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Also add information on how to contact you by electronic and paper mail.
from __future__ import print_function
import math
import re


# timecode_pat = re.compile(r"(\d+):(\d+):(\d+)(?:[.,](\d+))?")
timecode_pat = re.compile(r"(?:(\d+):)?(\d+):(\d+)(?:[.,](\d+))?")


def timecode_fromsecs(rawsecs, fract=True, alwaysfract=False, fractdelim=',', alwayshours=False):
    # returns a string in HH:MM:SS[.xxx] notation
    # if fract is True, uses .xxx if either necessary (non-zero)
    # OR alwaysfract is True
    hours = math.floor(rawsecs / 3600)
    rawsecs -= hours * 3600
    mins = math.floor(rawsecs / 60)
    rawsecs -= mins * 60
    if fract:
        secs = math.floor(rawsecs)
        rawsecs -= secs
        if rawsecs > 0 or alwaysfract:
            fract = "%.03f" % rawsecs
            if hours or alwayshours:
                return "%02d:%02d:%02d%s%s" % (hours, mins, secs, fractdelim, fract[2:])
            else:
                return "%02d:%02d%s%s" % (mins, secs, fractdelim, fract[2:])
        else:
            if hours or alwayshours:
                return "%02d:%02d:%02d" % (hours, mins, secs)
            else:
                return "%02d:%02d" % (mins, secs)
    else:
        secs = round(rawsecs)
        if hours or alwayshours:
            return "%02d:%02d:%02d" % (hours, mins, secs)
        else:
            return "%02d:%02d" % (mins, secs)


def timecode_tosecs(tcstr):
    r = timecode_pat.search(tcstr)
    if r:
        ret = 0
        if r.group(1):
            ret += 3600 * int(r.group(1))
        ret += 60 * int(r.group(2))
        ret += int(r.group(3))
        if r.group(4):
            ret = float(str(ret) + "." + r.group(4))
        return ret
    else:
        return None


def parse2secs(val):
    try:
        return float(val)
    except ValueError:
        return timecode_tosecs(val)
    ## to accept None
    # except TypeError:
    #     return


if __name__ == "__main__":
    def t(x):
        # with fraction
        s = timecode_fromsecs(x, True, False)
        print(x, "=>", s, "=>", timecode_tosecs(s))
        # without fraction
        s = timecode_fromsecs(x, False)
        print(x, "=>", s, "=>", timecode_tosecs(s))

    t(0)
    t(59.666666666666666)
    t(60)
    t(60.0)
    t(1235 / 3.0)
    t(10000.5)
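For reference, a couple of round-trips through the helpers above (with the default fractdelim=','), worked out from the code:

timecode_fromsecs(59.666666666666666)  # -> "00:59,667"
timecode_tosecs("00:59,667")           # -> 59.667
timecode_fromsecs(10000.5)             # -> "02:46:40,500"
timecode_tosecs("02:46:40,500")        # -> 10000.5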

@@ -0,0 +1,30 @@
#!/usr/bin/env python
import json
import argparse
import sys

# requires: timecode.py
# some functions to help working with (srt/vtt) timecodes
from timecode import timecode_fromsecs

ap = argparse.ArgumentParser(description="convert VOSK json output into a special vtt with timed json per word")
ap.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
ap.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
args = ap.parse_args()

data = json.load(args.infile)

def tc(s):
    return timecode_fromsecs(s, alwaysfract=True, fractdelim=".")

print("WEBVTT", file=args.outfile)
print(file=args.outfile)
for m in data['monologues']:
    for term in m['terms']:
        if term['type'] == "WORD":
            # one cue per word: timing line, then the word + confidence as JSON
            print(f"{tc(term['start'])} --> {tc(term['end'])}", file=args.outfile)
            # print(f"{term['text']}")
            tterm = {'text': term['text'], 'confidence': term['confidence']}
            print(json.dumps(tterm), file=args.outfile)
            print(file=args.outfile)
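Usage, with the file names from this commit (the script's own file name is not shown in this diff; vosk2vtt.py is assumed here):

python3 vosk2vtt.py w25mia60.json w25mia60_words.vtt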

@@ -0,0 +1,56 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<style>
#word {
position:relative;
text-align: center;
}
#word .content {
font-size: 100px;
color: black;
}
#word .content.iffy {
color: lightpink;
}
</style>
</head>
<body>
<audio controls style="width: 100%">
  <source src="w25mia60.mp3">
  <track kind="metadata" id="metadata" label="captions" src="w25mia60_words.vtt"></track>
</audio>
<div id="word">
<div class="content">hello</div>
</div>
<script>
const word_content = document.querySelector("#word .content");
const track = document.querySelector("track#metadata");
track.addEventListener("cuechange", function (e) {
// console.log(`track: cuechange: ${this}`);
if (this.track.activeCues) {
let word = this.track.activeCues[0]?.text;
console.log("word", word);
if (word) {
word = JSON.parse(word);
word_content.innerText = word['text'];
word_content.style.fontSize = (100 * word['confidence']) + "px";
      // mark lower-confidence words; toggle's second argument adds or removes the class as needed
      word_content.classList.toggle("iffy", word['confidence'] < 1.0);
}
}
});
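// "hidden" keeps the cues loaded and firing cuechange events without the browser rendering them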
track.track.mode = "hidden";
</script>
</body>
</html>

File diff suppressed because one or more lines are too long

w25mia60_words.vtt

@@ -0,0 +1,518 @@
WEBVTT
00:00.076 --> 00:00.270
{"text": "have", "confidence": 1.0}
00:00.270 --> 00:00.330
{"text": "a", "confidence": 1.0}
00:00.330 --> 00:00.870
{"text": "particular", "confidence": 1.0}
00:00.870 --> 00:01.560
{"text": "connection", "confidence": 1.0}
00:01.650 --> 00:01.800
{"text": "to", "confidence": 1.0}
00:01.800 --> 00:02.190
{"text": "worms", "confidence": 1.0}
00:02.250 --> 00:02.610
{"text": "inner", "confidence": 1.0}
00:02.610 --> 00:02.940
{"text": "and", "confidence": 1.0}
00:02.970 --> 00:03.330
{"text": "outer", "confidence": 1.0}
00:03.390 --> 00:04.140
{"text": "workings", "confidence": 1.0}
00:04.980 --> 00:05.070
{"text": "i", "confidence": 1.0}
00:05.070 --> 00:05.340
{"text": "wanted", "confidence": 1.0}
00:05.340 --> 00:05.700
{"text": "to", "confidence": 1.0}
00:05.730 --> 00:05.970
{"text": "make", "confidence": 1.0}
00:05.970 --> 00:06.060
{"text": "the", "confidence": 1.0}
00:06.060 --> 00:06.600
{"text": "series", "confidence": 1.0}
00:06.600 --> 00:07.110
{"text": "because", "confidence": 1.0}
00:07.140 --> 00:07.440
{"text": "one", "confidence": 0.715959}
00:07.440 --> 00:07.590
{"text": "has", "confidence": 1.0}
00:07.590 --> 00:08.190
{"text": "become", "confidence": 1.0}
00:08.280 --> 00:08.640
{"text": "over", "confidence": 1.0}
00:08.640 --> 00:08.940
{"text": "twenty", "confidence": 1.0}
00:08.940 --> 00:09.210
{"text": "five", "confidence": 1.0}
00:09.210 --> 00:09.690
{"text": "years", "confidence": 1.0}
00:09.690 --> 00:09.870
{"text": "and", "confidence": 0.868049}
00:09.870 --> 00:10.710
{"text": "institution", "confidence": 1.0}
00:11.460 --> 00:11.610
{"text": "but", "confidence": 1.0}
00:11.610 --> 00:11.700
{"text": "an", "confidence": 1.0}
00:11.700 --> 00:12.360
{"text": "institution", "confidence": 1.0}
00:12.360 --> 00:12.750
{"text": "built", "confidence": 1.0}
00:13.140 --> 00:13.495
{"text": "with", "confidence": 1.0}
00:13.500 --> 00:13.620
{"text": "and", "confidence": 0.908873}
00:13.620 --> 00:14.130
{"text": "for", "confidence": 0.664659}
00:14.130 --> 00:14.280
{"text": "and", "confidence": 1.0}
00:14.280 --> 00:14.850
{"text": "from", "confidence": 1.0}
00:14.880 --> 00:14.970
{"text": "a", "confidence": 1.0}
00:14.970 --> 00:15.120
{"text": "d", "confidence": 1.0}
00:15.120 --> 00:15.240
{"text": "i", "confidence": 1.0}
00:15.240 --> 00:15.480
{"text": "y", "confidence": 1.0}
00:15.480 --> 00:16.110
{"text": "spirits", "confidence": 1.0}
00:16.590 --> 00:16.740
{"text": "and", "confidence": 0.821086}
00:16.740 --> 00:17.160
{"text": "something", "confidence": 1.0}
00:17.220 --> 00:17.400
{"text": "that", "confidence": 1.0}
00:17.400 --> 00:17.460
{"text": "it", "confidence": 1.0}
00:17.460 --> 00:17.670
{"text": "still", "confidence": 1.0}
00:17.670 --> 00:18.390
{"text": "retains", "confidence": 1.0}
00:18.900 --> 00:19.080
{"text": "it's", "confidence": 1.0}
00:19.080 --> 00:19.170
{"text": "the", "confidence": 1.0}
00:19.170 --> 00:19.470
{"text": "nature", "confidence": 1.0}
00:19.470 --> 00:19.560
{"text": "of", "confidence": 1.0}
00:19.560 --> 00:19.710
{"text": "do", "confidence": 1.0}
00:19.710 --> 00:19.800
{"text": "i", "confidence": 1.0}
00:19.800 --> 00:20.018
{"text": "was", "confidence": 1.0}
00:20.018 --> 00:20.490
{"text": "spaces", "confidence": 0.697723}
00:20.490 --> 00:20.640
{"text": "of", "confidence": 1.0}
00:20.640 --> 00:21.060
{"text": "course", "confidence": 1.0}
00:21.060 --> 00:21.240
{"text": "and", "confidence": 1.0}
00:21.240 --> 00:21.420
{"text": "any", "confidence": 1.0}
00:21.420 --> 00:21.810
{"text": "cultural", "confidence": 1.0}
00:21.810 --> 00:22.410
{"text": "spaces", "confidence": 1.0}
00:22.470 --> 00:22.710
{"text": "that", "confidence": 1.0}
00:22.710 --> 00:23.040
{"text": "people", "confidence": 1.0}
00:23.040 --> 00:23.190
{"text": "who", "confidence": 1.0}
00:23.190 --> 00:23.490
{"text": "make", "confidence": 1.0}
00:23.490 --> 00:23.610
{"text": "it", "confidence": 1.0}
00:23.610 --> 00:23.880
{"text": "come", "confidence": 1.0}
00:23.880 --> 00:23.970
{"text": "and", "confidence": 1.0}
00:23.970 --> 00:24.390
{"text": "go", "confidence": 1.0}
00:24.570 --> 00:24.690
{"text": "the", "confidence": 1.0}
00:24.690 --> 00:24.900
{"text": "name", "confidence": 1.0}
00:24.900 --> 00:25.020
{"text": "on", "confidence": 1.0}
00:25.020 --> 00:25.080
{"text": "the", "confidence": 1.0}
00:25.080 --> 00:25.410
{"text": "building", "confidence": 1.0}
00:25.410 --> 00:25.740
{"text": "stays", "confidence": 1.0}
00:25.740 --> 00:25.830
{"text": "the", "confidence": 1.0}
00:25.830 --> 00:26.160
{"text": "same", "confidence": 1.0}
00:26.160 --> 00:26.370
{"text": "but", "confidence": 1.0}
00:26.370 --> 00:26.460
{"text": "the", "confidence": 1.0}
00:26.460 --> 00:26.880
{"text": "vibe", "confidence": 1.0}
00:26.970 --> 00:27.630
{"text": "changes", "confidence": 1.0}
00:28.020 --> 00:28.200
{"text": "and", "confidence": 1.0}
00:28.200 --> 00:28.260
{"text": "the", "confidence": 1.0}
00:28.260 --> 00:28.560
{"text": "sense", "confidence": 1.0}
00:28.560 --> 00:28.680
{"text": "of", "confidence": 1.0}
00:28.680 --> 00:29.190
{"text": "identity", "confidence": 1.0}
00:29.190 --> 00:29.880
{"text": "shifts", "confidence": 1.0}
00:31.020 --> 00:31.440
{"text": "worm", "confidence": 1.0}
00:31.470 --> 00:31.710
{"text": "is", "confidence": 1.0}
00:31.710 --> 00:31.770
{"text": "a", "confidence": 1.0}
00:31.770 --> 00:32.070
{"text": "place", "confidence": 1.0}
00:32.070 --> 00:32.190
{"text": "of", "confidence": 1.0}
00:32.190 --> 00:32.850
{"text": "invention", "confidence": 1.0}
00:32.850 --> 00:33.000
{"text": "and", "confidence": 1.0}
00:33.000 --> 00:33.810
{"text": "reinvention", "confidence": 1.0}
00:33.810 --> 00:34.170
{"text": "and", "confidence": 1.0}
00:34.230 --> 00:34.410
{"text": "with", "confidence": 0.722413}
00:34.410 --> 00:34.500
{"text": "the", "confidence": 1.0}
00:34.500 --> 00:34.920
{"text": "constant", "confidence": 1.0}
00:34.920 --> 00:35.460
{"text": "motion", "confidence": 1.0}
00:35.520 --> 00:35.880
{"text": "certain", "confidence": 1.0}
00:35.880 --> 00:36.390
{"text": "stories", "confidence": 1.0}
00:36.390 --> 00:36.510
{"text": "and", "confidence": 1.0}
00:36.510 --> 00:37.050
{"text": "memories", "confidence": 1.0}
00:37.080 --> 00:37.380
{"text": "do", "confidence": 0.854091}
00:37.410 --> 00:37.680
{"text": "fade", "confidence": 1.0}
00:37.680 --> 00:38.070
{"text": "away", "confidence": 1.0}
00:38.640 --> 00:39.120
{"text": "sometimes", "confidence": 1.0}
00:39.120 --> 00:39.330
{"text": "quite", "confidence": 1.0}
00:39.330 --> 00:39.870
{"text": "quickly", "confidence": 1.0}
00:40.770 --> 00:40.890
{"text": "the", "confidence": 1.0}
00:40.890 --> 00:41.160
{"text": "idea", "confidence": 1.0}
00:41.160 --> 00:41.310
{"text": "of", "confidence": 1.0}
00:41.340 --> 00:41.730
{"text": "this", "confidence": 1.0}
00:41.790 --> 00:42.240
{"text": "history", "confidence": 1.0}
00:42.240 --> 00:42.362
{"text": "of", "confidence": 1.0}
00:42.362 --> 00:42.564
{"text": "why", "confidence": 0.478323}
00:42.564 --> 00:42.780
{"text": "i'm", "confidence": 1.0}
00:42.870 --> 00:43.080
{"text": "not", "confidence": 1.0}
00:43.080 --> 00:43.200
{"text": "the", "confidence": 1.0}
00:43.260 --> 00:43.620
{"text": "only", "confidence": 1.0}
00:43.650 --> 00:44.130
{"text": "possible", "confidence": 1.0}
00:44.160 --> 00:44.640
{"text": "history", "confidence": 1.0}
00:44.670 --> 00:44.790
{"text": "of", "confidence": 1.0}
00:44.790 --> 00:45.270
{"text": "course", "confidence": 1.0}
00:45.330 --> 00:45.510
{"text": "is", "confidence": 1.0}
00:45.510 --> 00:45.630
{"text": "to", "confidence": 1.0}
00:45.630 --> 00:45.930
{"text": "try", "confidence": 1.0}
00:45.930 --> 00:46.080
{"text": "and", "confidence": 1.0}
00:46.080 --> 00:46.650
{"text": "capture", "confidence": 1.0}
00:47.040 --> 00:47.160
{"text": "a", "confidence": 1.0}
00:47.160 --> 00:47.370
{"text": "few", "confidence": 1.0}
00:47.370 --> 00:47.460
{"text": "of", "confidence": 1.0}
00:47.460 --> 00:47.700
{"text": "those", "confidence": 1.0}
00:47.700 --> 00:48.210
{"text": "stories", "confidence": 1.0}
00:48.240 --> 00:48.360
{"text": "with", "confidence": 1.0}
00:48.360 --> 00:48.420
{"text": "the", "confidence": 1.0}
00:48.420 --> 00:48.750
{"text": "people", "confidence": 1.0}
00:48.750 --> 00:48.900
{"text": "that", "confidence": 1.0}
00:48.930 --> 00:49.080
{"text": "i've", "confidence": 1.0}
00:49.080 --> 00:49.530
{"text": "met", "confidence": 1.0}
00:49.950 --> 00:50.121
{"text": "and", "confidence": 1.0}
00:50.139 --> 00:50.370
{"text": "people", "confidence": 1.0}
00:50.370 --> 00:50.490
{"text": "that", "confidence": 0.966533}
00:50.490 --> 00:50.748
{"text": "maria", "confidence": 0.314138}
00:50.748 --> 00:50.910
{"text": "it", "confidence": 0.542389}
00:50.934 --> 00:51.120
{"text": "has", "confidence": 1.0}
00:51.120 --> 00:51.570
{"text": "known", "confidence": 0.49303}
00:51.960 --> 00:52.230
{"text": "along", "confidence": 1.0}
00:52.230 --> 00:52.320
{"text": "the", "confidence": 1.0}
00:52.320 --> 00:52.680
{"text": "way", "confidence": 1.0}
00:53.790 --> 00:53.940
{"text": "but", "confidence": 1.0}
00:53.940 --> 00:54.000
{"text": "i'm", "confidence": 1.0}
00:54.000 --> 00:54.090
{"text": "not", "confidence": 1.0}
00:54.090 --> 00:54.210
{"text": "going", "confidence": 1.0}
00:54.210 --> 00:54.270
{"text": "to", "confidence": 1.0}
00:54.270 --> 00:54.630
{"text": "begin", "confidence": 1.0}
00:54.660 --> 00:54.780
{"text": "at", "confidence": 1.0}
00:54.780 --> 00:54.840
{"text": "the", "confidence": 1.0}
00:54.840 --> 00:55.380
{"text": "beginning", "confidence": 1.0}
00:55.920 --> 00:56.130
{"text": "that", "confidence": 1.0}
00:56.130 --> 00:56.250
{"text": "would", "confidence": 1.0}
00:56.250 --> 00:56.340
{"text": "be", "confidence": 1.0}
00:56.340 --> 00:56.730
{"text": "logical", "confidence": 1.0}
00:56.730 --> 00:56.880
{"text": "but", "confidence": 1.0}
00:56.880 --> 00:57.210
{"text": "actually", "confidence": 1.0}
00:57.210 --> 00:57.300
{"text": "have", "confidence": 1.0}
00:57.300 --> 00:57.540
{"text": "already", "confidence": 1.0}
00:57.540 --> 00:58.020
{"text": "begun", "confidence": 1.0}
00:58.050 --> 00:58.680
{"text": "obviously", "confidence": 1.0}
00:58.680 --> 00:58.860
{"text": "and", "confidence": 1.0}
00:58.860 --> 00:59.370
{"text": "today's", "confidence": 1.0}
00:59.370 --> 00:59.970
{"text": "episode", "confidence": 1.0}