-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
188 lines (166 loc) · 5.72 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
package main
import (
"bytes"
"flag"
"fmt"
"io"
"log/slog"
"os"
"strconv"
"strings"
"time"
"github.com/giulianopz/go-gstt/pkg/client"
"github.com/giulianopz/go-gstt/pkg/logger"
"github.com/giulianopz/go-gstt/pkg/opts"
"github.com/giulianopz/go-gstt/pkg/transcription"
goflac "github.com/go-flac/go-flac"
)
// usage is the help text printed by flag.Usage (installed in main) in place
// of the flag package's auto-generated flag listing.
const usage = `Usage:
gstt [OPTION]... --interim --continuous [--file FILE]
Options:
--verbose
--file, path of audio file to transcribe
--key, api key built into chromium
--language, language of the recording transcription, use the standard webcodes for your language, i.e. 'en-US' for English-US, 'ru' for Russian, etc. please, see https://en.wikipedia.org/wiki/IETF_language_tag
--continuous, keeps the stream open and transcoding as long as there is no silence
--interim, sends back results before it's finished, so you get a live stream of possible transcriptions as it processes the audio
--max-alts, how many possible transcriptions do you want
--pfilter, profanity filter ('0'=off, '1'=medium, '2'=strict)
--user-agent, user-agent for spoofing
--sample-rate, audio sampling rate
--subtitle-mode, shows the transcriptions as if they were subtitles, while playing the media file, clearing the screen at each transcription
`
// Command-line flag values. They are bound to their flags and filled in by
// flag.Parse in main, then turned into transcription options by fromFlags.
var (
	// On/off switches: --verbose, --continuous, --interim, --subtitle-mode.
	verbose, continuous, interim, subtitleMode bool

	// Textual settings: --file, --key, --language, --max-alts, --pfilter,
	// --user-agent. maxAlts and pFilter are kept as strings here and parsed
	// as integers in fromFlags.
	filePath, apiKey, language, maxAlts, pFilter, userAgent string

	// Audio sampling rate: --sample-rate.
	sampleRate int
)
// wordReadingTime is how long each word of a transcript is kept on screen in
// subtitle mode. It assumes a reading speed of about 238 WPM (words per
// minute), i.e. roughly 0.26 seconds per word; see https://thereadtime.com/.
const wordReadingTime = 260 * time.Millisecond

// main parses the command-line flags, starts a transcription of either the
// FLAC file given with --file or the audio piped to standard input, and
// prints every final transcription received on the output channel.
func main() {
	flag.BoolVar(&verbose, "verbose", false, "verbose")
	flag.StringVar(&filePath, "file", "", "path of audio file to transcribe")
	flag.StringVar(&apiKey, "key", "", "API key to authenticates request (default is the one built into any Chrome installation)")
	flag.StringVar(&language, "language", "null", "language of the recording transcription, use the standard codes for your language, i.e. 'en-US' for English-US, 'ru' for Russian, etc. please, see https://en.wikipedia.org/wiki/IETF_language_tag")
	flag.BoolVar(&continuous, "continuous", false, "to keep the stream open and transcoding as long as there is no silence")
	flag.BoolVar(&interim, "interim", false, "to send back results before its finished, so you get a live stream of possible transcriptions as it processes the audio")
	flag.StringVar(&maxAlts, "max-alts", "1", "how many possible transcriptions do you want")
	flag.StringVar(&pFilter, "pfilter", "2", "profanity filter ('0'=off, '1'=medium, '2'=strict)")
	flag.StringVar(&userAgent, "user-agent", opts.DefaultUserAgent, "user-agent for spoofing (default 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36')")
	flag.IntVar(&sampleRate, "sample-rate", opts.DefaultSampleRate, "audio sampling rate")
	flag.BoolVar(&subtitleMode, "subtitle-mode", false, "shows the transcriptions as if they were subtitles, while playing the media file, clearing the screen at each transcription")

	// Print the hand-written usage text instead of the generated flag list.
	flag.Usage = func() { fmt.Print(usage) }
	flag.Parse()

	if verbose {
		logger.Level(slog.LevelDebug)
	}

	var (
		httpC   = client.New()
		options = fromFlags()
		out     = make(chan *transcription.Response)
	)

	if filePath != "" { // transcribe from file
		f, err := goflac.ParseFile(filePath)
		if err != nil {
			logger.Error("cannot parse file", "err", err)
			os.Exit(1)
		}

		data, err := f.GetStreamInfo()
		if err != nil {
			logger.Error("cannot get file info", "err", err)
			os.Exit(1)
		}
		// The sample rate stored in the file overrides the --sample-rate flag.
		options.SampleRate = data.SampleRate
		logger.Info("done parsing file", "sample rate", data.SampleRate)

		go httpC.Transcribe(bytes.NewBuffer(f.Marshal()), out, options)
	} else { // transcribe from microphone input
		pr, pw := io.Pipe()
		defer pr.Close()

		// forwardStdin owns the write side and closes it at end of input.
		go forwardStdin(pw)
		go httpC.Transcribe(pr, out, options)
	}

	// NOTE(review): this loop only ends when out is closed; presumably
	// Transcribe closes it once the stream is over — confirm in pkg/client.
	for resp := range out {
		for _, result := range resp.Result {
			if !result.Final {
				continue
			}
			for _, alt := range result.Alternative {
				logger.Debug("got transcription", slog.Float64("confidence", alt.Confidence), slog.String("transcript", alt.Transcript))

				transcript := strings.TrimSpace(alt.Transcript)
				fmt.Print(transcript)

				if subtitleMode {
					// Leave the text on screen long enough to be read ...
					time.Sleep(readingTime(transcript))
					// ... then clear the entire screen with ANSI escapes.
					fmt.Print("\x1b[H\x1b[2J\x1b[3J\n")
				} else {
					fmt.Println()
				}
			}
		}
	}
}

// forwardStdin copies standard input into pw in chunks of up to 1024 bytes
// until stdin reports EOF, then closes pw so that the read side of the pipe
// observes EOF as well. Any other read error, or a failed write to the pipe,
// is logged and terminates the program with exit status 1.
func forwardStdin(pw *io.PipeWriter) {
	defer pw.Close()

	buf := make([]byte, 1024)
	for {
		n, err := os.Stdin.Read(buf)
		if n > 0 {
			// Forward only the bytes actually read: the rest of buf still
			// holds data from earlier iterations.
			chunk := buf[:n]
			logger.Debug("read from stdin", "bs", chunk)
			if _, werr := pw.Write(chunk); werr != nil {
				logger.Error("cannot write to pipe", "err", werr)
				os.Exit(1)
			}
		}
		// A Read may return data together with an error, so the error is
		// inspected after the data has been forwarded.
		if err == io.EOF {
			logger.Info("done reading from stdin")
			return
		}
		if err != nil {
			logger.Error("cannot read from stdin", "err", err)
			os.Exit(1)
		}
	}
}

// readingTime returns how long a transcript is shown in subtitle mode:
// wordReadingTime for each whitespace-separated word.
func readingTime(transcript string) time.Duration {
	// Scale in the Duration domain. Converting a fractional number of
	// seconds to time.Duration before multiplying by time.Second would
	// truncate it to whole seconds (e.g. 3 words -> 0.78 -> 0s).
	return time.Duration(len(strings.Fields(transcript))) * wordReadingTime
}
// fromFlags converts the parsed command-line flag values into the options
// passed to the transcription client.
//
// Boolean options are added only when their flag is set and string options
// only when they are non-empty; the user agent and the sample rate are always
// added. --max-alts and --pfilter must be decimal integers: an invalid value
// is logged and the program exits with status 1.
func fromFlags() *opts.Options {
	var options []opts.Option

	if verbose {
		options = append(options, opts.Verbose(true))
	}
	if filePath != "" {
		options = append(options, opts.FilePath(filePath))
	}
	if apiKey != "" {
		options = append(options, opts.ApiKey(apiKey))
	}
	if language != "" {
		options = append(options, opts.Language(language))
	}
	if continuous {
		options = append(options, opts.Continuous(true))
	}
	if interim {
		options = append(options, opts.Interim(true))
	}
	if maxAlts != "" {
		options = append(options, opts.MaxAlts(intFlagValue("max-alts", maxAlts)))
	}
	if pFilter != "" {
		options = append(options, opts.ProfanityFilter(intFlagValue("pfilter", pFilter)))
	}
	options = append(options, opts.UserAgent(userAgent), opts.SampleRate(sampleRate))

	return opts.Apply(options...)
}

// intFlagValue parses value, the argument given to the flag called name, as a
// decimal integer. A value that is not an integer is a user error rather than
// a programming bug, so it is reported through the logger and the program
// exits with status 1 instead of panicking.
func intFlagValue(name, value string) int {
	num, err := strconv.Atoi(value)
	if err != nil {
		logger.Error("invalid integer value for flag", "flag", name, "value", value, "err", err)
		os.Exit(1)
	}
	return num
}