yt-video-to-summary / utils /subtitles.py
zman1x1's picture
Upload 21 files
3456a58
raw
history blame
2.04 kB
from youtube_transcript_api import YouTubeTranscriptApi as ytapi
from youtube_transcript_api.formatters import TextFormatter
import json
def getSubsText(video_id="", getGenerated=False):
    """Return the transcript of a YouTube video as one line of plain text.

    Args:
        video_id: Video identifier handed to ``ytapi.list_transcripts``.
        getGenerated: Accepted for API compatibility; not implemented yet
            and currently ignored.

    Returns:
        The transcript formatted by ``TextFormatter`` with every newline
        replaced by a single space.

    Raises:
        ValueError: If the transcript list for the video yields no
            transcripts.
    """
    # TODO: implement getGenerated
    transcripts = list(ytapi.list_transcripts(video_id))
    if not transcripts:
        raise ValueError(f"no transcripts available for video {video_id!r}")

    # The last listed transcript is the one that gets returned, so fetch
    # only that one rather than downloading every listed transcript.
    data = transcripts[-1].fetch()
    return TextFormatter().format_transcript(data).replace("\n", " ")
def getSubs(video_id="", getGenerated=False, chunker=None):
    """Fetch the raw transcript entries of a YouTube video.

    Args:
        video_id: Video identifier handed to ``ytapi.list_transcripts``.
        getGenerated: Accepted for API compatibility; not implemented yet
            and currently ignored.
        chunker: Accepted for API compatibility; currently unused.

    Returns:
        Whatever ``fetch()`` returns for the last transcript yielded by the
        transcript list.

    Raises:
        ValueError: If the transcript list for the video yields no
            transcripts.
    """
    # TODO: implement getGenerated
    transcripts = list(ytapi.list_transcripts(video_id))
    if not transcripts:
        raise ValueError(f"no transcripts available for video {video_id!r}")

    # The last listed transcript is the one that gets returned, so fetch
    # only that one rather than downloading every listed transcript.
    return transcripts[-1].fetch()
class subs:
    """Transcript of one YouTube video plus helpers to render it.

    The transcript is obtained once, in ``__init__``, through the
    module-level ``getSubs`` function and stored in ``self.subs``. The
    rendering methods index each entry with the keys ``"text"``,
    ``"start"`` and ``"duration"``, so entries are expected to be mappings
    providing those keys.
    """

    def __init__(self, video_id="", generated=False):
        """Fetch and store the transcript for *video_id*.

        Args:
            video_id: Video identifier forwarded to ``getSubs``.
            generated: Forwarded to ``getSubs`` as its second argument.
        """
        self.video_id = video_id
        self.generated = generated
        # Inside a method body the bare name ``getSubs`` resolves to the
        # module-level function, not to the method of the same name below.
        self.subs = getSubs(video_id, generated)

    def __sizeof__(self) -> int:
        """Return the number of transcript entries.

        NOTE(review): ``__sizeof__`` is the hook used by ``sys.getsizeof``
        and conventionally reports a size in bytes; here it reports the
        entry count. Kept as-is for backward compatibility.
        """
        return sum(1 for _ in self.subs)

    def getText(self):
        """Return the transcript as one line of plain text.

        The entries are formatted by ``TextFormatter`` and every newline in
        the result is replaced by a single space.
        """
        return TextFormatter().format_transcript(self.subs).replace("\n", " ")

    def getSubs(self):
        """Return the transcript as newline-separated ``text:::duration`` rows."""
        return "\n".join(
            f"{entry['text']}:::{entry['duration']}" for entry in self.subs
        )

    def getSubsRaw(self):
        """Return the stored transcript entries unchanged."""
        return self.subs

    def getSubsList(self, size=100):
        """Group consecutive transcript lines into chunks of about *size* chars.

        Lines are appended to the current chunk, separated by single spaces,
        while the chunk (counting one separator per line) stays within
        *size* characters. A single line longer than *size* forms a chunk
        of its own, which therefore exceeds *size*.

        Args:
            size: Character budget per chunk.

        Returns:
            A list of ``[chunk_text, timestamp]`` pairs. ``timestamp`` is
            the ``"start"`` value of the line being processed when the
            chunk was closed: for every chunk but the last this is the
            first line that did not fit; for the last chunk it is the final
            line of the transcript. Chunks whose text is empty after
            stripping are omitted.
        """
        chunks = []
        pending = []      # texts of the lines in the chunk being built
        pending_len = 0   # length of the chunk, one separator per line
        last_start = 0

        def flush():
            # Emit the chunk being built, unless it carries no text.
            text = " ".join(pending).strip()
            if text:
                chunks.append([text, last_start])

        for line in self.subs:
            text = line["text"]
            last_start = line["start"]
            if pending_len + len(text) + 1 <= size:
                pending.append(text)
                pending_len += len(text) + 1
            else:
                flush()
                pending = [text]
                pending_len = len(text) + 1

        # The lines gathered after the last overflow still form a chunk;
        # without this final flush the tail of the transcript is lost.
        flush()
        return chunks