# Panda70M / panda70m_downloader.py
# ubuntu
# Initial Commit
# c25690f
import os
import shutil
import pandas as pd
from vidfetch import compress_folder, pull_from_hf
def download_video_links(hf_token: str, filename: str, save_dir: str):
    """Fetch one file from the ``OpenVideo/Panda-70M-Original-Links`` repo.

    The actual transfer is delegated to ``pull_from_hf``; this function only
    makes sure the destination directory exists first.

    Args:
        hf_token: Token forwarded to ``pull_from_hf`` as ``hf_token``.
        filename: Name of the file to request from the repository.
        save_dir: Destination directory forwarded to ``pull_from_hf``. It is
            created (including missing parents) when it does not exist.
    """
    # exist_ok=True replaces the exists()/makedirs() pair, which could raise
    # FileExistsError if another process created the directory in between.
    os.makedirs(save_dir, exist_ok=True)

    pull_from_hf(
        hf_token=hf_token,
        hf_repo_id="OpenVideo/Panda-70M-Original-Links",
        filename=filename,
        save_dir=save_dir,
    )
def _remove_oversized_files(directory: str, max_size_mb: float) -> list:
    """Delete every entry of ``directory`` larger than ``max_size_mb``.

    Returns the list of paths that were removed.
    """
    removed = []
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        size_mb = os.path.getsize(path) / (1024 * 1024)  # bytes -> megabytes
        if size_mb > max_size_mb:
            os.remove(path)
            removed.append(path)
    return removed


def _write_download_log(
    log_path: str,
    failed_links: list,
    deleted_files: list,
    max_size_mb: float,
) -> None:
    """Write the failed links and the deleted files to ``log_path``."""
    lines = ["Fail to download"]
    lines.extend(str(link) for link in failed_links)
    # ":g" keeps the default limit rendered as "500" rather than "500.0".
    lines.append(f"Delete videos larger than {max_size_mb:g}MB")
    lines.extend(str(path) for path in deleted_files)
    with open(log_path, "w", encoding="utf-8") as log_file:
        # One entry per line, so the second heading never fuses with the
        # last failed link.
        log_file.write("\n".join(lines) + "\n")


def download_videos_by_csv(
    csv_file_path: str,
    save_dir: str,
    targz_filename: str,
    max_size_mb: float = 500,
):
    """Download the videos listed in a CSV file and hand the folder to ``compress_folder``.

    Steps:
      1. Create ``<save_dir>/<folder_name>/download_raw`` where ``folder_name``
         is ``targz_filename`` without ``.tar.gz``.
      2. Copy the CSV into that directory.
      3. Download each ``url`` with ``youtube_dl``, skipping links whose
         target ``.mp4`` already exists. Failed links are collected instead
         of aborting the run.
      4. Delete every file in the directory larger than ``max_size_mb``.
      5. Write ``log.txt`` (failed links and deleted files) into the
         directory.
      6. Call ``compress_folder(download_videos_dir, targz_path)``.

    Args:
        csv_file_path: Path of a CSV file with ``url`` and ``videoID`` columns.
        save_dir: Base directory for the download folder and ``targz_filename``.
        targz_filename: File name joined onto ``save_dir`` to form the path
            passed to ``compress_folder``.
        max_size_mb: Size limit in megabytes; larger files are deleted after
            downloading. Defaults to 500.

    Raises:
        ModuleNotFoundError: If ``youtube_dl`` cannot be imported.
    """
    try:
        import youtube_dl
    except ImportError as err:
        raise ModuleNotFoundError(
            "youtube_dl missed, please install it by ``vidfetch.package.youtube.youtube_dl_install_helper``"
        ) from err

    # path/dir
    folder_name = targz_filename.replace(".tar.gz", "")
    download_videos_dir = os.path.join(save_dir, folder_name, "download_raw")
    log_path = os.path.join(download_videos_dir, "log.txt")
    targz_path = os.path.join(save_dir, targz_filename)

    # make dirs (exist_ok avoids the check-then-create race)
    os.makedirs(download_videos_dir, exist_ok=True)

    # read from csv; a copy of the CSV is kept next to the downloaded videos
    csv_filename = os.path.basename(csv_file_path)
    shutil.copy(src=csv_file_path, dst=os.path.join(download_videos_dir, csv_filename))
    data = pd.read_csv(csv_file_path)
    links = data["url"].tolist()
    videos_id = data["videoID"].tolist()

    failed_links = []  # record failed links
    for link, video_id in zip(links, videos_id):
        # The first character of videoID is dropped from the file name.
        # NOTE(review): presumably a prefix character in the CSV -- confirm.
        video_save_path = os.path.join(download_videos_dir, video_id[1:] + ".mp4")

        # check if downloaded
        if os.path.exists(video_save_path):
            continue

        # download
        ydl_opts = {
            'format': 'best',
            'quiet': False,
            'outtmpl': video_save_path,
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            try:
                ydl.download([link])
            except Exception:
                # Best effort: remember the link and keep going. Unlike a bare
                # ``except``, Ctrl-C / SystemExit still stop the run.
                failed_links.append(link)

    # Delete files larger than the limit. Every entry of the directory is
    # checked, which includes the CSV copied above.
    deleted_files = _remove_oversized_files(download_videos_dir, max_size_mb)

    # The log lives inside download_videos_dir, so it is written before the
    # folder is passed to compress_folder.
    _write_download_log(log_path, failed_links, deleted_files, max_size_mb)

    compress_folder(download_videos_dir, targz_path)