main
Michael Murtaugh 3 months ago
parent 738d07dc3e
commit e4fcd42f23

.gitignore vendored

@@ -1,4 +1,3 @@
*.mp3
*~
# python stuff
__pycache__/

@@ -0,0 +1,2 @@
*.mp3
*.wav

@@ -1,6 +1,9 @@
#!/bin/bash
#
# vosk can also output JSON that includes the timing of each individual
# detected WORD!
# NOTE: I had an error when I did this and needed to PATCH some PYTHON code in VOSK
# see VOSKPATCH.TXT
vosk-transcriber -l en-us -i worm25_mia_60.wav -t json -o worm25_mia_60.json
vosk-transcriber -l en-us -i w25mia60.wav -t json -o w25mia60.json
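As a quick sanity check on the per-word output, a minimal sketch (assuming the monologues/terms schema that the vtt converter further down consumes):

# peek at the first few word timings in the vosk JSON output
import json

with open("w25mia60.json") as f:
    data = json.load(f)

words = [t for m in data["monologues"] for t in m["terms"] if t["type"] == "WORD"]
for w in words[:5]:
    print(w["start"], w["end"], w["text"], w["confidence"])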

timecode.py

@@ -0,0 +1,96 @@
# This file is part of Active Archives.
# Copyright 2006-2016 the Active Archives contributors (see AUTHORS)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Also add information on how to contact you by electronic and paper mail.
from __future__ import print_function
import math
import re


# timecode_pat = re.compile(r"(\d+):(\d+):(\d+)(?:[.,](\d+))?")
timecode_pat = re.compile(r"(?:(\d+):)?(\d+):(\d+)(?:[.,](\d+))?")


def timecode_fromsecs(rawsecs, fract=True, alwaysfract=False, fractdelim=',', alwayshours=False):
    # returns a string in HH:MM:SS[.xxx] notation
    # if fract is True, uses .xxx if either necessary (non-zero)
    # OR alwaysfract is True
    hours = math.floor(rawsecs / 3600)
    rawsecs -= hours * 3600
    mins = math.floor(rawsecs / 60)
    rawsecs -= mins * 60
    if fract:
        secs = math.floor(rawsecs)
        rawsecs -= secs
        if rawsecs > 0 or alwaysfract:
            fract = "%.03f" % rawsecs
            if hours or alwayshours:
                return "%02d:%02d:%02d%s%s" % (hours, mins, secs, fractdelim, fract[2:])
            else:
                return "%02d:%02d%s%s" % (mins, secs, fractdelim, fract[2:])
        else:
            if hours or alwayshours:
                return "%02d:%02d:%02d" % (hours, mins, secs)
            else:
                return "%02d:%02d" % (mins, secs)
    else:
        secs = round(rawsecs)
        if hours or alwayshours:
            return "%02d:%02d:%02d" % (hours, mins, secs)
        else:
            return "%02d:%02d" % (mins, secs)


def timecode_tosecs(tcstr):
    r = timecode_pat.search(tcstr)
    if r:
        ret = 0
        if r.group(1):
            ret += 3600 * int(r.group(1))
        ret += 60 * int(r.group(2))
        ret += int(r.group(3))
        if r.group(4):
            ret = float(str(ret) + "." + r.group(4))
        return ret
    else:
        return None


def parse2secs(val):
    try:
        return float(val)
    except ValueError:
        return timecode_tosecs(val)
    ## to accept None
    # except TypeError:
    #     return


if __name__ == "__main__":
    def t(x):
        # with fraction
        s = timecode_fromsecs(x, True, False)
        print(x, "=>", s, "=>", timecode_tosecs(s))
        # without fraction
        s = timecode_fromsecs(x, False)
        print(x, "=>", s, "=>", timecode_tosecs(s))

    t(0)
    t(59.666666666666666)
    t(60)
    t(60.0)
    t(1235 / 3.0)
    t(10000.5)
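For reference, a couple of round-trips through the helpers above (with the default fractdelim=','), worked out from the code:

timecode_fromsecs(59.666666666666666)  # -> "00:59,667"
timecode_tosecs("00:59,667")           # -> 59.667
timecode_fromsecs(10000.5)             # -> "02:46:40,500"
timecode_tosecs("02:46:40,500")        # -> 10000.5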

@@ -0,0 +1,30 @@
#!/usr/bin/env python
import json
import argparse
import sys

# requires: timecode.py
# some functions to help working with (srt/vtt) timecodes
from timecode import timecode_fromsecs

ap = argparse.ArgumentParser(description="convert VOSK json output into a special vtt with timed json per word")
ap.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
ap.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
args = ap.parse_args()

data = json.load(args.infile)

def tc(s):
    return timecode_fromsecs(s, alwaysfract=True, fractdelim=".")

print("WEBVTT", file=args.outfile)
print(file=args.outfile)
for m in data['monologues']:
    for term in m['terms']:
        if term['type'] == "WORD":
            # one cue per word: timing line, then the word + confidence as JSON
            print(f"{tc(term['start'])} --> {tc(term['end'])}", file=args.outfile)
            # print(f"{term['text']}")
            tterm = {'text': term['text'], 'confidence': term['confidence']}
            print(json.dumps(tterm), file=args.outfile)
            print(file=args.outfile)
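Usage, with the file names from this commit (the script's own file name is not shown in this diff; vosk2vtt.py is assumed here):

python3 vosk2vtt.py w25mia60.json w25mia60_words.vtt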

@@ -0,0 +1,56 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<style>
#word {
position:relative;
text-align: center;
}
#word .content {
font-size: 100px;
color: black;
}
#word .content.iffy {
color: lightpink;
}
</style>
</head>
<body>
<audio controls style="width: 100%">
  <source src="w25mia60.mp3">
  <track kind="metadata" id="metadata" label="captions" src="w25mia60_words.vtt"></track>
</audio>
<div id="word">
<div class="content">hello</div>
</div>
<script>
const word_content = document.querySelector("#word .content");
const track = document.querySelector("track#metadata");
track.addEventListener("cuechange", function (e) {
// console.log(`track: cuechange: ${this}`);
if (this.track.activeCues) {
let word = this.track.activeCues[0]?.text;
console.log("word", word);
if (word) {
word = JSON.parse(word);
word_content.innerText = word['text'];
word_content.style.fontSize = (100 * word['confidence']) + "px";
      // mark lower-confidence words; toggle's second argument adds or removes the class as needed
      word_content.classList.toggle("iffy", word['confidence'] < 1.0);
}
}
});
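// "hidden" keeps the cues loaded and firing cuechange events without the browser rendering them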
track.track.mode = "hidden";
</script>
</body>
</html>

File diff suppressed because one or more lines are too long

w25mia60_words.vtt

@@ -0,0 +1,518 @@
WEBVTT
00:00.076 --> 00:00.270
{"text": "have", "confidence": 1.0}
00:00.270 --> 00:00.330
{"text": "a", "confidence": 1.0}
00:00.330 --> 00:00.870
{"text": "particular", "confidence": 1.0}
00:00.870 --> 00:01.560
{"text": "connection", "confidence": 1.0}
00:01.650 --> 00:01.800
{"text": "to", "confidence": 1.0}
00:01.800 --> 00:02.190
{"text": "worms", "confidence": 1.0}
00:02.250 --> 00:02.610
{"text": "inner", "confidence": 1.0}
00:02.610 --> 00:02.940
{"text": "and", "confidence": 1.0}
00:02.970 --> 00:03.330
{"text": "outer", "confidence": 1.0}
00:03.390 --> 00:04.140
{"text": "workings", "confidence": 1.0}
00:04.980 --> 00:05.070
{"text": "i", "confidence": 1.0}
00:05.070 --> 00:05.340
{"text": "wanted", "confidence": 1.0}
00:05.340 --> 00:05.700
{"text": "to", "confidence": 1.0}
00:05.730 --> 00:05.970
{"text": "make", "confidence": 1.0}
00:05.970 --> 00:06.060
{"text": "the", "confidence": 1.0}
00:06.060 --> 00:06.600
{"text": "series", "confidence": 1.0}
00:06.600 --> 00:07.110
{"text": "because", "confidence": 1.0}
00:07.140 --> 00:07.440
{"text": "one", "confidence": 0.715959}
00:07.440 --> 00:07.590
{"text": "has", "confidence": 1.0}
00:07.590 --> 00:08.190
{"text": "become", "confidence": 1.0}
00:08.280 --> 00:08.640
{"text": "over", "confidence": 1.0}
00:08.640 --> 00:08.940
{"text": "twenty", "confidence": 1.0}
00:08.940 --> 00:09.210
{"text": "five", "confidence": 1.0}
00:09.210 --> 00:09.690
{"text": "years", "confidence": 1.0}
00:09.690 --> 00:09.870
{"text": "and", "confidence": 0.868049}
00:09.870 --> 00:10.710
{"text": "institution", "confidence": 1.0}
00:11.460 --> 00:11.610
{"text": "but", "confidence": 1.0}
00:11.610 --> 00:11.700
{"text": "an", "confidence": 1.0}
00:11.700 --> 00:12.360
{"text": "institution", "confidence": 1.0}
00:12.360 --> 00:12.750
{"text": "built", "confidence": 1.0}
00:13.140 --> 00:13.495
{"text": "with", "confidence": 1.0}
00:13.500 --> 00:13.620
{"text": "and", "confidence": 0.908873}
00:13.620 --> 00:14.130
{"text": "for", "confidence": 0.664659}
00:14.130 --> 00:14.280
{"text": "and", "confidence": 1.0}
00:14.280 --> 00:14.850
{"text": "from", "confidence": 1.0}
00:14.880 --> 00:14.970
{"text": "a", "confidence": 1.0}
00:14.970 --> 00:15.120
{"text": "d", "confidence": 1.0}
00:15.120 --> 00:15.240
{"text": "i", "confidence": 1.0}
00:15.240 --> 00:15.480
{"text": "y", "confidence": 1.0}
00:15.480 --> 00:16.110
{"text": "spirits", "confidence": 1.0}
00:16.590 --> 00:16.740
{"text": "and", "confidence": 0.821086}
00:16.740 --> 00:17.160
{"text": "something", "confidence": 1.0}
00:17.220 --> 00:17.400
{"text": "that", "confidence": 1.0}
00:17.400 --> 00:17.460
{"text": "it", "confidence": 1.0}
00:17.460 --> 00:17.670
{"text": "still", "confidence": 1.0}
00:17.670 --> 00:18.390
{"text": "retains", "confidence": 1.0}
00:18.900 --> 00:19.080
{"text": "it's", "confidence": 1.0}
00:19.080 --> 00:19.170
{"text": "the", "confidence": 1.0}
00:19.170 --> 00:19.470
{"text": "nature", "confidence": 1.0}
00:19.470 --> 00:19.560
{"text": "of", "confidence": 1.0}
00:19.560 --> 00:19.710
{"text": "do", "confidence": 1.0}
00:19.710 --> 00:19.800
{"text": "i", "confidence": 1.0}
00:19.800 --> 00:20.018
{"text": "was", "confidence": 1.0}
00:20.018 --> 00:20.490
{"text": "spaces", "confidence": 0.697723}
00:20.490 --> 00:20.640
{"text": "of", "confidence": 1.0}
00:20.640 --> 00:21.060
{"text": "course", "confidence": 1.0}
00:21.060 --> 00:21.240
{"text": "and", "confidence": 1.0}
00:21.240 --> 00:21.420
{"text": "any", "confidence": 1.0}
00:21.420 --> 00:21.810
{"text": "cultural", "confidence": 1.0}
00:21.810 --> 00:22.410
{"text": "spaces", "confidence": 1.0}
00:22.470 --> 00:22.710
{"text": "that", "confidence": 1.0}
00:22.710 --> 00:23.040
{"text": "people", "confidence": 1.0}
00:23.040 --> 00:23.190
{"text": "who", "confidence": 1.0}
00:23.190 --> 00:23.490
{"text": "make", "confidence": 1.0}
00:23.490 --> 00:23.610
{"text": "it", "confidence": 1.0}
00:23.610 --> 00:23.880
{"text": "come", "confidence": 1.0}
00:23.880 --> 00:23.970
{"text": "and", "confidence": 1.0}
00:23.970 --> 00:24.390
{"text": "go", "confidence": 1.0}
00:24.570 --> 00:24.690
{"text": "the", "confidence": 1.0}
00:24.690 --> 00:24.900
{"text": "name", "confidence": 1.0}
00:24.900 --> 00:25.020
{"text": "on", "confidence": 1.0}
00:25.020 --> 00:25.080
{"text": "the", "confidence": 1.0}
00:25.080 --> 00:25.410
{"text": "building", "confidence": 1.0}
00:25.410 --> 00:25.740
{"text": "stays", "confidence": 1.0}
00:25.740 --> 00:25.830
{"text": "the", "confidence": 1.0}
00:25.830 --> 00:26.160
{"text": "same", "confidence": 1.0}
00:26.160 --> 00:26.370
{"text": "but", "confidence": 1.0}
00:26.370 --> 00:26.460
{"text": "the", "confidence": 1.0}
00:26.460 --> 00:26.880
{"text": "vibe", "confidence": 1.0}
00:26.970 --> 00:27.630
{"text": "changes", "confidence": 1.0}
00:28.020 --> 00:28.200
{"text": "and", "confidence": 1.0}
00:28.200 --> 00:28.260
{"text": "the", "confidence": 1.0}
00:28.260 --> 00:28.560
{"text": "sense", "confidence": 1.0}
00:28.560 --> 00:28.680
{"text": "of", "confidence": 1.0}
00:28.680 --> 00:29.190
{"text": "identity", "confidence": 1.0}
00:29.190 --> 00:29.880
{"text": "shifts", "confidence": 1.0}
00:31.020 --> 00:31.440
{"text": "worm", "confidence": 1.0}
00:31.470 --> 00:31.710
{"text": "is", "confidence": 1.0}
00:31.710 --> 00:31.770
{"text": "a", "confidence": 1.0}
00:31.770 --> 00:32.070
{"text": "place", "confidence": 1.0}
00:32.070 --> 00:32.190
{"text": "of", "confidence": 1.0}
00:32.190 --> 00:32.850
{"text": "invention", "confidence": 1.0}
00:32.850 --> 00:33.000
{"text": "and", "confidence": 1.0}
00:33.000 --> 00:33.810
{"text": "reinvention", "confidence": 1.0}
00:33.810 --> 00:34.170
{"text": "and", "confidence": 1.0}
00:34.230 --> 00:34.410
{"text": "with", "confidence": 0.722413}
00:34.410 --> 00:34.500
{"text": "the", "confidence": 1.0}
00:34.500 --> 00:34.920
{"text": "constant", "confidence": 1.0}
00:34.920 --> 00:35.460
{"text": "motion", "confidence": 1.0}
00:35.520 --> 00:35.880
{"text": "certain", "confidence": 1.0}
00:35.880 --> 00:36.390
{"text": "stories", "confidence": 1.0}
00:36.390 --> 00:36.510
{"text": "and", "confidence": 1.0}
00:36.510 --> 00:37.050
{"text": "memories", "confidence": 1.0}
00:37.080 --> 00:37.380
{"text": "do", "confidence": 0.854091}
00:37.410 --> 00:37.680
{"text": "fade", "confidence": 1.0}
00:37.680 --> 00:38.070
{"text": "away", "confidence": 1.0}
00:38.640 --> 00:39.120
{"text": "sometimes", "confidence": 1.0}
00:39.120 --> 00:39.330
{"text": "quite", "confidence": 1.0}
00:39.330 --> 00:39.870
{"text": "quickly", "confidence": 1.0}
00:40.770 --> 00:40.890
{"text": "the", "confidence": 1.0}
00:40.890 --> 00:41.160
{"text": "idea", "confidence": 1.0}
00:41.160 --> 00:41.310
{"text": "of", "confidence": 1.0}
00:41.340 --> 00:41.730
{"text": "this", "confidence": 1.0}
00:41.790 --> 00:42.240
{"text": "history", "confidence": 1.0}
00:42.240 --> 00:42.362
{"text": "of", "confidence": 1.0}
00:42.362 --> 00:42.564
{"text": "why", "confidence": 0.478323}
00:42.564 --> 00:42.780
{"text": "i'm", "confidence": 1.0}
00:42.870 --> 00:43.080
{"text": "not", "confidence": 1.0}
00:43.080 --> 00:43.200
{"text": "the", "confidence": 1.0}
00:43.260 --> 00:43.620
{"text": "only", "confidence": 1.0}
00:43.650 --> 00:44.130
{"text": "possible", "confidence": 1.0}
00:44.160 --> 00:44.640
{"text": "history", "confidence": 1.0}
00:44.670 --> 00:44.790
{"text": "of", "confidence": 1.0}
00:44.790 --> 00:45.270
{"text": "course", "confidence": 1.0}
00:45.330 --> 00:45.510
{"text": "is", "confidence": 1.0}
00:45.510 --> 00:45.630
{"text": "to", "confidence": 1.0}
00:45.630 --> 00:45.930
{"text": "try", "confidence": 1.0}
00:45.930 --> 00:46.080
{"text": "and", "confidence": 1.0}
00:46.080 --> 00:46.650
{"text": "capture", "confidence": 1.0}
00:47.040 --> 00:47.160
{"text": "a", "confidence": 1.0}
00:47.160 --> 00:47.370
{"text": "few", "confidence": 1.0}
00:47.370 --> 00:47.460
{"text": "of", "confidence": 1.0}
00:47.460 --> 00:47.700
{"text": "those", "confidence": 1.0}
00:47.700 --> 00:48.210
{"text": "stories", "confidence": 1.0}
00:48.240 --> 00:48.360
{"text": "with", "confidence": 1.0}
00:48.360 --> 00:48.420
{"text": "the", "confidence": 1.0}
00:48.420 --> 00:48.750
{"text": "people", "confidence": 1.0}
00:48.750 --> 00:48.900
{"text": "that", "confidence": 1.0}
00:48.930 --> 00:49.080
{"text": "i've", "confidence": 1.0}
00:49.080 --> 00:49.530
{"text": "met", "confidence": 1.0}
00:49.950 --> 00:50.121
{"text": "and", "confidence": 1.0}
00:50.139 --> 00:50.370
{"text": "people", "confidence": 1.0}
00:50.370 --> 00:50.490
{"text": "that", "confidence": 0.966533}
00:50.490 --> 00:50.748
{"text": "maria", "confidence": 0.314138}
00:50.748 --> 00:50.910
{"text": "it", "confidence": 0.542389}
00:50.934 --> 00:51.120
{"text": "has", "confidence": 1.0}
00:51.120 --> 00:51.570
{"text": "known", "confidence": 0.49303}
00:51.960 --> 00:52.230
{"text": "along", "confidence": 1.0}
00:52.230 --> 00:52.320
{"text": "the", "confidence": 1.0}
00:52.320 --> 00:52.680
{"text": "way", "confidence": 1.0}
00:53.790 --> 00:53.940
{"text": "but", "confidence": 1.0}
00:53.940 --> 00:54.000
{"text": "i'm", "confidence": 1.0}
00:54.000 --> 00:54.090
{"text": "not", "confidence": 1.0}
00:54.090 --> 00:54.210
{"text": "going", "confidence": 1.0}
00:54.210 --> 00:54.270
{"text": "to", "confidence": 1.0}
00:54.270 --> 00:54.630
{"text": "begin", "confidence": 1.0}
00:54.660 --> 00:54.780
{"text": "at", "confidence": 1.0}
00:54.780 --> 00:54.840
{"text": "the", "confidence": 1.0}
00:54.840 --> 00:55.380
{"text": "beginning", "confidence": 1.0}
00:55.920 --> 00:56.130
{"text": "that", "confidence": 1.0}
00:56.130 --> 00:56.250
{"text": "would", "confidence": 1.0}
00:56.250 --> 00:56.340
{"text": "be", "confidence": 1.0}
00:56.340 --> 00:56.730
{"text": "logical", "confidence": 1.0}
00:56.730 --> 00:56.880
{"text": "but", "confidence": 1.0}
00:56.880 --> 00:57.210
{"text": "actually", "confidence": 1.0}
00:57.210 --> 00:57.300
{"text": "have", "confidence": 1.0}
00:57.300 --> 00:57.540
{"text": "already", "confidence": 1.0}
00:57.540 --> 00:58.020
{"text": "begun", "confidence": 1.0}
00:58.050 --> 00:58.680
{"text": "obviously", "confidence": 1.0}
00:58.680 --> 00:58.860
{"text": "and", "confidence": 1.0}
00:58.860 --> 00:59.370
{"text": "today's", "confidence": 1.0}
00:59.370 --> 00:59.970
{"text": "episode", "confidence": 1.0}