main
Michael Murtaugh 1 month ago
parent 738d07dc3e
commit e4fcd42f23

1
.gitignore vendored

@@ -1,4 +1,3 @@
*.mp3
*~
# python stuff
__pycache__/

@@ -0,0 +1,2 @@
*.mp3
*.wav

@@ -1,6 +1,9 @@
#!/bin/bash
#
# vosk can also output JSON that includes the timing of each individual
# detected WORD!
# NOTE: I had an error when I did this and needed to PATCH some PYTHON code in VOSK # NOTE: I had an error when I did this and needed to PATCH some PYTHON code in VOSK
# see VOSKPATCH.TXT # see VOSKPATCH.TXT
vosk-transcriber -l en-us -i worm25_mia_60.wav -t json -o worm25_mia_60.json
vosk-transcriber -l en-us -i w25mia60.wav -t json -o w25mia60.json
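
Note on the JSON this command writes: the converter script added further down in this commit walks it. Judging only from the fields that script reads, the output has roughly the shape sketched below, one entry per recognized word (the example values come from the first cue of the generated VTT); any other fields the transcriber emits are omitted here and may vary between vosk versions.

    {"monologues": [{"terms": [
        {"type": "WORD", "text": "have", "start": 0.076, "end": 0.27, "confidence": 1.0},
        ...
    ]}]}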

@@ -0,0 +1,96 @@
# This file is part of Active Archives.
# Copyright 2006-2016 the Active Archives contributors (see AUTHORS)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Also add information on how to contact you by electronic and paper mail.
from __future__ import print_function
import math
import re

# timecode_pat = re.compile(r"(\d+):(\d+):(\d+)(?:[.,](\d+))?")
timecode_pat = re.compile(r"(?:(\d+):)?(\d+):(\d+)(?:[.,](\d+))?")

def timecode_fromsecs(rawsecs, fract=True, alwaysfract=False, fractdelim=',', alwayshours=False):
    # returns a string in HH:MM:SS[.xxx] notation
    # if fract is True, uses .xxx if either necessary (non-zero)
    # OR alwaysfract is True
    hours = math.floor(rawsecs / 3600)
    rawsecs -= hours * 3600
    mins = math.floor(rawsecs / 60)
    rawsecs -= mins * 60
    if fract:
        secs = math.floor(rawsecs)
        rawsecs -= secs
        if (rawsecs > 0 or alwaysfract):
            fract = "%.03f" % rawsecs
            if hours or alwayshours:
                return "%02d:%02d:%02d%s%s" % (hours, mins, secs, fractdelim, \
                    fract[2:])
            else:
                return "%02d:%02d%s%s" % (mins, secs, fractdelim, fract[2:])
        else:
            if hours or alwayshours:
                return "%02d:%02d:%02d" % (hours, mins, secs)
            else:
                return "%02d:%02d" % (mins, secs)
    else:
        secs = round(rawsecs)
        if hours or alwayshours:
            return "%02d:%02d:%02d" % (hours, mins, secs)
        else:
            return "%02d:%02d" % (mins, secs)

def timecode_tosecs(tcstr):
    r = timecode_pat.search(tcstr)
    if r:
        ret = 0
        if r.group(1):
            ret += 3600 * int(r.group(1))
        ret += 60 * int(r.group(2))
        ret += int(r.group(3))
        if (r.group(4)):
            ret = float(str(ret) + "." + r.group(4))
        return ret
    else:
        return None

def parse2secs(val):
    try:
        return float(val)
    except ValueError:
        return timecode_tosecs(val)
    ## to accept None
    # except TypeError:
    #     return

if __name__ == "__main__":
    def t(x):
        # with fraction
        s = timecode_fromsecs(x, True, False)
        print (x, "=>", s, "=>", timecode_tosecs(s))
        # without fraction
        s = timecode_fromsecs(x, False)
        print (x, "=>", s, "=>", timecode_tosecs(s))

    t(0)
    t(59.666666666666666)
    t(60)
    t(60.0)
    t(1235 / 3.0)
    t(10000.5)
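
As a quick sanity check of the module above in the configuration the VTT converter in the next file uses (alwaysfract=True, fractdelim="."), a round trip looks like this (a sketch; the values follow directly from the code above):

    from timecode import timecode_fromsecs, timecode_tosecs
    timecode_fromsecs(61.5, alwaysfract=True, fractdelim=".")    # -> "01:01.500"
    timecode_fromsecs(3600, alwaysfract=True, fractdelim=".")    # -> "01:00:00.000"
    timecode_tosecs("01:01.500")                                 # -> 61.5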

@@ -0,0 +1,30 @@
#!/usr/bin/env python
import json
import argparse
import sys
# requires: timecode.py
# some functions to help working with (srt/vtt) timecodes
from timecode import timecode_fromsecs
ap = argparse.ArgumentParser(description="convert VOSK json output into a special vtt with timed json per word")
ap.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=sys.stdin)
ap.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout)
args = ap.parse_args()
data = json.load(args.infile)
def tc(s):
    return timecode_fromsecs(s, alwaysfract=True, fractdelim=".")

print ("WEBVTT", file=args.outfile)
print (file=args.outfile)

for m in data['monologues']:
    for term in m['terms']:
        if term['type'] == "WORD":
            print (f"{tc(term['start'])} --> {tc(term['end'])}", file=args.outfile)
            # print (f"{term['text']}")
            tterm = {'text': term['text'], 'confidence': term['confidence']}
            print (json.dumps(tterm), file=args.outfile)
            print (file=args.outfile)
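
Since infile and outfile both default to stdin/stdout, the converter can be run with explicit filenames or in a pipe. The player page below expects the result to be saved as w25mia60_words.vtt next to w25mia60.mp3; the converter's own filename is not visible in this view, so "json2vtt.py" here is only a placeholder:

    python3 json2vtt.py w25mia60.json w25mia60_words.vtt
    # or, relying on the stdin/stdout defaults:
    python3 json2vtt.py < w25mia60.json > w25mia60_words.vtt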

@@ -0,0 +1,56 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="" xml:lang="">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
  <style>
    #word {
      position: relative;
      text-align: center;
    }
    #word .content {
      font-size: 100px;
      color: black;
    }
    #word .content.iffy {
      color: lightpink;
    }
  </style>
</head>
<body>
  <audio controls style="width: 100%">
    <track kind="metadata" id="metadata" label="captions" src="w25mia60_words.vtt"></track>
    <source src="w25mia60.mp3">
  </audio>
  <div id="word">
    <div class="content">hello</div>
  </div>
  <script>
    const word_content = document.querySelector("#word .content");
    const track = document.querySelector("track#metadata");
    track.addEventListener("cuechange", function (e) {
      // console.log(`track: cuechange: ${this}`);
      if (this.track.activeCues) {
        let word = this.track.activeCues[0]?.text;
        console.log("word", word);
        if (word) {
          word = JSON.parse(word);
          word_content.innerText = word['text'];
          word_content.style.fontSize = (100 * word['confidence']) + "px";
          // low-confidence words are tinted pink: keep the "iffy" class in
          // sync with the word's confidence on every cue change
          word_content.classList.toggle("iffy", word['confidence'] < 1.0);
        }
      }
    });
    track.track.mode = "hidden";
  </script>
</body>
</html>
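
A note on the player: the metadata track loads w25mia60_words.vtt, and each word cue fires a cuechange event; setting track.track.mode = "hidden" keeps the cues active (so the events fire) without the browser drawing them as captions. Some browsers refuse to load the .vtt when the page is opened from file://, so serving the directory over HTTP is the safer route, for example:

    python3 -m http.server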

File diff suppressed because one or more lines are too long

@@ -0,0 +1,518 @@
WEBVTT
00:00.076 --> 00:00.270
{"text": "have", "confidence": 1.0}
00:00.270 --> 00:00.330
{"text": "a", "confidence": 1.0}
00:00.330 --> 00:00.870
{"text": "particular", "confidence": 1.0}
00:00.870 --> 00:01.560
{"text": "connection", "confidence": 1.0}
00:01.650 --> 00:01.800
{"text": "to", "confidence": 1.0}
00:01.800 --> 00:02.190
{"text": "worms", "confidence": 1.0}
00:02.250 --> 00:02.610
{"text": "inner", "confidence": 1.0}
00:02.610 --> 00:02.940
{"text": "and", "confidence": 1.0}
00:02.970 --> 00:03.330
{"text": "outer", "confidence": 1.0}
00:03.390 --> 00:04.140
{"text": "workings", "confidence": 1.0}
00:04.980 --> 00:05.070
{"text": "i", "confidence": 1.0}
00:05.070 --> 00:05.340
{"text": "wanted", "confidence": 1.0}
00:05.340 --> 00:05.700
{"text": "to", "confidence": 1.0}
00:05.730 --> 00:05.970
{"text": "make", "confidence": 1.0}
00:05.970 --> 00:06.060
{"text": "the", "confidence": 1.0}
00:06.060 --> 00:06.600
{"text": "series", "confidence": 1.0}
00:06.600 --> 00:07.110
{"text": "because", "confidence": 1.0}
00:07.140 --> 00:07.440
{"text": "one", "confidence": 0.715959}
00:07.440 --> 00:07.590
{"text": "has", "confidence": 1.0}
00:07.590 --> 00:08.190
{"text": "become", "confidence": 1.0}
00:08.280 --> 00:08.640
{"text": "over", "confidence": 1.0}
00:08.640 --> 00:08.940
{"text": "twenty", "confidence": 1.0}
00:08.940 --> 00:09.210
{"text": "five", "confidence": 1.0}
00:09.210 --> 00:09.690
{"text": "years", "confidence": 1.0}
00:09.690 --> 00:09.870
{"text": "and", "confidence": 0.868049}
00:09.870 --> 00:10.710
{"text": "institution", "confidence": 1.0}
00:11.460 --> 00:11.610
{"text": "but", "confidence": 1.0}
00:11.610 --> 00:11.700
{"text": "an", "confidence": 1.0}
00:11.700 --> 00:12.360
{"text": "institution", "confidence": 1.0}
00:12.360 --> 00:12.750
{"text": "built", "confidence": 1.0}
00:13.140 --> 00:13.495
{"text": "with", "confidence": 1.0}
00:13.500 --> 00:13.620
{"text": "and", "confidence": 0.908873}
00:13.620 --> 00:14.130
{"text": "for", "confidence": 0.664659}
00:14.130 --> 00:14.280
{"text": "and", "confidence": 1.0}
00:14.280 --> 00:14.850
{"text": "from", "confidence": 1.0}
00:14.880 --> 00:14.970
{"text": "a", "confidence": 1.0}
00:14.970 --> 00:15.120
{"text": "d", "confidence": 1.0}
00:15.120 --> 00:15.240
{"text": "i", "confidence": 1.0}
00:15.240 --> 00:15.480
{"text": "y", "confidence": 1.0}
00:15.480 --> 00:16.110
{"text": "spirits", "confidence": 1.0}
00:16.590 --> 00:16.740
{"text": "and", "confidence": 0.821086}
00:16.740 --> 00:17.160
{"text": "something", "confidence": 1.0}
00:17.220 --> 00:17.400
{"text": "that", "confidence": 1.0}
00:17.400 --> 00:17.460
{"text": "it", "confidence": 1.0}
00:17.460 --> 00:17.670
{"text": "still", "confidence": 1.0}
00:17.670 --> 00:18.390
{"text": "retains", "confidence": 1.0}
00:18.900 --> 00:19.080
{"text": "it's", "confidence": 1.0}
00:19.080 --> 00:19.170
{"text": "the", "confidence": 1.0}
00:19.170 --> 00:19.470
{"text": "nature", "confidence": 1.0}
00:19.470 --> 00:19.560
{"text": "of", "confidence": 1.0}
00:19.560 --> 00:19.710
{"text": "do", "confidence": 1.0}
00:19.710 --> 00:19.800
{"text": "i", "confidence": 1.0}
00:19.800 --> 00:20.018
{"text": "was", "confidence": 1.0}
00:20.018 --> 00:20.490
{"text": "spaces", "confidence": 0.697723}
00:20.490 --> 00:20.640
{"text": "of", "confidence": 1.0}
00:20.640 --> 00:21.060
{"text": "course", "confidence": 1.0}
00:21.060 --> 00:21.240
{"text": "and", "confidence": 1.0}
00:21.240 --> 00:21.420
{"text": "any", "confidence": 1.0}
00:21.420 --> 00:21.810
{"text": "cultural", "confidence": 1.0}
00:21.810 --> 00:22.410
{"text": "spaces", "confidence": 1.0}
00:22.470 --> 00:22.710
{"text": "that", "confidence": 1.0}
00:22.710 --> 00:23.040
{"text": "people", "confidence": 1.0}
00:23.040 --> 00:23.190
{"text": "who", "confidence": 1.0}
00:23.190 --> 00:23.490
{"text": "make", "confidence": 1.0}
00:23.490 --> 00:23.610
{"text": "it", "confidence": 1.0}
00:23.610 --> 00:23.880
{"text": "come", "confidence": 1.0}
00:23.880 --> 00:23.970
{"text": "and", "confidence": 1.0}
00:23.970 --> 00:24.390
{"text": "go", "confidence": 1.0}
00:24.570 --> 00:24.690
{"text": "the", "confidence": 1.0}
00:24.690 --> 00:24.900
{"text": "name", "confidence": 1.0}
00:24.900 --> 00:25.020
{"text": "on", "confidence": 1.0}
00:25.020 --> 00:25.080
{"text": "the", "confidence": 1.0}
00:25.080 --> 00:25.410
{"text": "building", "confidence": 1.0}
00:25.410 --> 00:25.740
{"text": "stays", "confidence": 1.0}
00:25.740 --> 00:25.830
{"text": "the", "confidence": 1.0}
00:25.830 --> 00:26.160
{"text": "same", "confidence": 1.0}
00:26.160 --> 00:26.370
{"text": "but", "confidence": 1.0}
00:26.370 --> 00:26.460
{"text": "the", "confidence": 1.0}
00:26.460 --> 00:26.880
{"text": "vibe", "confidence": 1.0}
00:26.970 --> 00:27.630
{"text": "changes", "confidence": 1.0}
00:28.020 --> 00:28.200
{"text": "and", "confidence": 1.0}
00:28.200 --> 00:28.260
{"text": "the", "confidence": 1.0}
00:28.260 --> 00:28.560
{"text": "sense", "confidence": 1.0}
00:28.560 --> 00:28.680
{"text": "of", "confidence": 1.0}
00:28.680 --> 00:29.190
{"text": "identity", "confidence": 1.0}
00:29.190 --> 00:29.880
{"text": "shifts", "confidence": 1.0}
00:31.020 --> 00:31.440
{"text": "worm", "confidence": 1.0}
00:31.470 --> 00:31.710
{"text": "is", "confidence": 1.0}
00:31.710 --> 00:31.770
{"text": "a", "confidence": 1.0}
00:31.770 --> 00:32.070
{"text": "place", "confidence": 1.0}
00:32.070 --> 00:32.190
{"text": "of", "confidence": 1.0}
00:32.190 --> 00:32.850
{"text": "invention", "confidence": 1.0}
00:32.850 --> 00:33.000
{"text": "and", "confidence": 1.0}
00:33.000 --> 00:33.810
{"text": "reinvention", "confidence": 1.0}
00:33.810 --> 00:34.170
{"text": "and", "confidence": 1.0}
00:34.230 --> 00:34.410
{"text": "with", "confidence": 0.722413}
00:34.410 --> 00:34.500
{"text": "the", "confidence": 1.0}
00:34.500 --> 00:34.920
{"text": "constant", "confidence": 1.0}
00:34.920 --> 00:35.460
{"text": "motion", "confidence": 1.0}
00:35.520 --> 00:35.880
{"text": "certain", "confidence": 1.0}
00:35.880 --> 00:36.390
{"text": "stories", "confidence": 1.0}
00:36.390 --> 00:36.510
{"text": "and", "confidence": 1.0}
00:36.510 --> 00:37.050
{"text": "memories", "confidence": 1.0}
00:37.080 --> 00:37.380
{"text": "do", "confidence": 0.854091}
00:37.410 --> 00:37.680
{"text": "fade", "confidence": 1.0}
00:37.680 --> 00:38.070
{"text": "away", "confidence": 1.0}
00:38.640 --> 00:39.120
{"text": "sometimes", "confidence": 1.0}
00:39.120 --> 00:39.330
{"text": "quite", "confidence": 1.0}
00:39.330 --> 00:39.870
{"text": "quickly", "confidence": 1.0}
00:40.770 --> 00:40.890
{"text": "the", "confidence": 1.0}
00:40.890 --> 00:41.160
{"text": "idea", "confidence": 1.0}
00:41.160 --> 00:41.310
{"text": "of", "confidence": 1.0}
00:41.340 --> 00:41.730
{"text": "this", "confidence": 1.0}
00:41.790 --> 00:42.240
{"text": "history", "confidence": 1.0}
00:42.240 --> 00:42.362
{"text": "of", "confidence": 1.0}
00:42.362 --> 00:42.564
{"text": "why", "confidence": 0.478323}
00:42.564 --> 00:42.780
{"text": "i'm", "confidence": 1.0}
00:42.870 --> 00:43.080
{"text": "not", "confidence": 1.0}
00:43.080 --> 00:43.200
{"text": "the", "confidence": 1.0}
00:43.260 --> 00:43.620
{"text": "only", "confidence": 1.0}
00:43.650 --> 00:44.130
{"text": "possible", "confidence": 1.0}
00:44.160 --> 00:44.640
{"text": "history", "confidence": 1.0}
00:44.670 --> 00:44.790
{"text": "of", "confidence": 1.0}
00:44.790 --> 00:45.270
{"text": "course", "confidence": 1.0}
00:45.330 --> 00:45.510
{"text": "is", "confidence": 1.0}
00:45.510 --> 00:45.630
{"text": "to", "confidence": 1.0}
00:45.630 --> 00:45.930
{"text": "try", "confidence": 1.0}
00:45.930 --> 00:46.080
{"text": "and", "confidence": 1.0}
00:46.080 --> 00:46.650
{"text": "capture", "confidence": 1.0}
00:47.040 --> 00:47.160
{"text": "a", "confidence": 1.0}
00:47.160 --> 00:47.370
{"text": "few", "confidence": 1.0}
00:47.370 --> 00:47.460
{"text": "of", "confidence": 1.0}
00:47.460 --> 00:47.700
{"text": "those", "confidence": 1.0}
00:47.700 --> 00:48.210
{"text": "stories", "confidence": 1.0}
00:48.240 --> 00:48.360
{"text": "with", "confidence": 1.0}
00:48.360 --> 00:48.420
{"text": "the", "confidence": 1.0}
00:48.420 --> 00:48.750
{"text": "people", "confidence": 1.0}
00:48.750 --> 00:48.900
{"text": "that", "confidence": 1.0}
00:48.930 --> 00:49.080
{"text": "i've", "confidence": 1.0}
00:49.080 --> 00:49.530
{"text": "met", "confidence": 1.0}
00:49.950 --> 00:50.121
{"text": "and", "confidence": 1.0}
00:50.139 --> 00:50.370
{"text": "people", "confidence": 1.0}
00:50.370 --> 00:50.490
{"text": "that", "confidence": 0.966533}
00:50.490 --> 00:50.748
{"text": "maria", "confidence": 0.314138}
00:50.748 --> 00:50.910
{"text": "it", "confidence": 0.542389}
00:50.934 --> 00:51.120
{"text": "has", "confidence": 1.0}
00:51.120 --> 00:51.570
{"text": "known", "confidence": 0.49303}
00:51.960 --> 00:52.230
{"text": "along", "confidence": 1.0}
00:52.230 --> 00:52.320
{"text": "the", "confidence": 1.0}
00:52.320 --> 00:52.680
{"text": "way", "confidence": 1.0}
00:53.790 --> 00:53.940
{"text": "but", "confidence": 1.0}
00:53.940 --> 00:54.000
{"text": "i'm", "confidence": 1.0}
00:54.000 --> 00:54.090
{"text": "not", "confidence": 1.0}
00:54.090 --> 00:54.210
{"text": "going", "confidence": 1.0}
00:54.210 --> 00:54.270
{"text": "to", "confidence": 1.0}
00:54.270 --> 00:54.630
{"text": "begin", "confidence": 1.0}
00:54.660 --> 00:54.780
{"text": "at", "confidence": 1.0}
00:54.780 --> 00:54.840
{"text": "the", "confidence": 1.0}
00:54.840 --> 00:55.380
{"text": "beginning", "confidence": 1.0}
00:55.920 --> 00:56.130
{"text": "that", "confidence": 1.0}
00:56.130 --> 00:56.250
{"text": "would", "confidence": 1.0}
00:56.250 --> 00:56.340
{"text": "be", "confidence": 1.0}
00:56.340 --> 00:56.730
{"text": "logical", "confidence": 1.0}
00:56.730 --> 00:56.880
{"text": "but", "confidence": 1.0}
00:56.880 --> 00:57.210
{"text": "actually", "confidence": 1.0}
00:57.210 --> 00:57.300
{"text": "have", "confidence": 1.0}
00:57.300 --> 00:57.540
{"text": "already", "confidence": 1.0}
00:57.540 --> 00:58.020
{"text": "begun", "confidence": 1.0}
00:58.050 --> 00:58.680
{"text": "obviously", "confidence": 1.0}
00:58.680 --> 00:58.860
{"text": "and", "confidence": 1.0}
00:58.860 --> 00:59.370
{"text": "today's", "confidence": 1.0}
00:59.370 --> 00:59.970
{"text": "episode", "confidence": 1.0}