Getting Started with DLFeat
This guide will help you quickly get started with DLFeat to extract features from various modalities.
Installation
First, install core dependencies:
pip install torch torchvision torchaudio scikit-learn Pillow numpy scipy av
pip install transformers sentence-transformers timm requests
For the best experience and access to all models, it’s recommended to keep these libraries, especially transformers and torchvision, up-to-date:
pip install --upgrade torch torchvision torchaudio transformers sentence-transformers timm requests
Note that dlfeat is modular: if some of those dependencies are not installed, it still works with reduced functionality (see the dependency check sketch at the end of this section).
Then install the package with:
pip install git+https://github.com/antoninofurnari/dlfeat.git
After installing the library, you can run the self-tests to see which models are available in your environment:
import DLFeat
DLFeat.run_self_tests()
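Since the library is modular, you can also check which optional backends are installed before choosing a model. The snippet below is a minimal sketch using only the Python standard library; the dependency list simply mirrors the install commands above and is not an official DLFeat table.

import importlib.util

# Optional backends from the install commands above (illustrative list).
optional_deps = ["torchvision", "torchaudio", "transformers",
                 "sentence_transformers", "timm", "av"]

for dep in optional_deps:
    status = "available" if importlib.util.find_spec(dep) else "missing"
    print(f"{dep}: {status}")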
Basic Usage
Here’s how to use DLFeatExtractor for different types of data:
1. Import DLFeat Components
from DLFeat import DLFeatExtractor, list_available_models
import os # For dummy file creation/cleanup in examples
from PIL import Image, ImageDraw # For dummy image creation
import numpy as np_example # Renamed to avoid conflict if DLFeat itself uses np
import scipy.io.wavfile as scipy_wav_example # Renamed
# You can list available models (optional)
# print("All available models:", list_available_models())
# print("Image models:", list_available_models(task_type="image"))
2. Image Feature Extraction
# Initialize for an image model
try:
    img_extractor = DLFeatExtractor(model_name="resnet18", task_type="image")
    print("Successfully initialized resnet18 extractor for images.")

    # Create a dummy image for this snippet
    dummy_img_path_gs = "temp_dummy_image_gs.png"
    try:
        img_gs = Image.new('RGB', (224, 224), color='skyblue')
        d_gs = ImageDraw.Draw(img_gs)
        d_gs.text((10, 10), "Sample Img", fill=(0, 0, 0))
        img_gs.save(dummy_img_path_gs)

        # DLFeat expects a list of inputs for the transform method
        image_paths_gs = [dummy_img_path_gs, dummy_img_path_gs]

        # Fit (a no-op for pre-trained models) and transform
        img_extractor.fit(image_paths_gs)
        image_features_gs = img_extractor.transform(image_paths_gs)
        print(f"Image features shape: {image_features_gs.shape}")
        # Example output for resnet18 with 2 images: (2, 512)
    except ImportError:
        print("Pillow (for ImageDraw) not found, skipping image creation part of snippet.")
    except Exception as e_img:
        print(f"Error in image feature extraction snippet: {e_img}")
    finally:
        if os.path.exists(dummy_img_path_gs):
            os.remove(dummy_img_path_gs)
except Exception as e_init_img:
    print(f"Could not initialize image extractor: {e_init_img}")
    print("Ensure dependencies for image models are installed.")
3. Text Feature Extraction
try:
    text_extractor = DLFeatExtractor(model_name="sentence-bert", task_type="text")
    print("Successfully initialized sentence-bert extractor for text.")

    texts_gs = ["This is the first sentence for DLFeat feature extraction.",
                "DLFeat aims to make embeddings easy to obtain."]
    text_features_gs = text_extractor.transform(texts_gs)
    print(f"Text features shape: {text_features_gs.shape}")
    # Example output for sentence-bert with 2 sentences: (2, 384)
except Exception as e_text:
    print(f"Error in text feature extraction snippet: {e_text}")
    print("Ensure dependencies for text models (like sentence-transformers) are installed.")
4. Audio Feature Extraction
This snippet uses numpy and scipy to synthesize a dummy audio file; the wav2vec2 model itself additionally requires torchaudio and transformers.
try:
    # Create a dummy audio file for this snippet
    dummy_audio_path_gs = "temp_dummy_audio_gs.wav"
    try:
        sample_rate_gs = 16000
        duration_gs = 1
        frequency_gs = 440
        t_gs = np_example.linspace(0, duration_gs, int(sample_rate_gs * duration_gs), endpoint=False)
        data_gs = np_example.sin(2 * np_example.pi * frequency_gs * t_gs) * 0.5
        data_int16_gs = (data_gs * 32767).astype(np_example.int16)
        scipy_wav_example.write(dummy_audio_path_gs, sample_rate_gs, data_int16_gs)

        audio_extractor = DLFeatExtractor(model_name="wav2vec2_base", task_type="audio")
        print("Successfully initialized wav2vec2_base extractor for audio.")

        audio_paths_gs = [dummy_audio_path_gs, dummy_audio_path_gs]
        audio_features_gs = audio_extractor.transform(audio_paths_gs)
        print(f"Audio features shape: {audio_features_gs.shape}")
        # Example output for wav2vec2_base with 2 audio files: (2, 768)
    except ImportError:
        print("Scipy or Numpy not found, skipping audio creation/processing part of snippet.")
    except Exception as e_audio:
        print(f"Error in audio feature extraction snippet: {e_audio}")
    finally:
        if os.path.exists(dummy_audio_path_gs):
            os.remove(dummy_audio_path_gs)
except Exception as e_init_audio:
    print(f"Could not initialize audio extractor: {e_init_audio}")
    print("Ensure dependencies for audio models (like transformers, torchaudio) are installed.")
5. Video Feature Extraction
Note: Video processing can be resource-intensive and slow. The example below uses r2plus1d_18. For actual use, replace placeholder paths with paths to your real video files. The self-test function run_self_tests(attempt_real_video_test=True) attempts to download a small sample video if the requests library is installed.
try:
    video_extractor = DLFeatExtractor(model_name="r2plus1d_18", task_type="video")
    print("Successfully initialized r2plus1d_18 extractor for video.")

    # For actual use, create a list of paths to your video files:
    # video_paths = ["path/to/your/video1.mp4", "path/to/your/video2.avi"]
    # video_features = video_extractor.transform(video_paths)
    # print(f"Video features shape: {video_features.shape}")
    print("To test video feature extraction, please provide a list of actual video file paths to the .transform() method.")
    print("Example: video_features = video_extractor.transform(['my_video.mp4'])")
except Exception as e_video:
    print(f"Could not initialize video extractor or run snippet: {e_video}")
    print("Ensure dependencies for video models (like torchvision, transformers, and potentially ffmpeg for video reading) are correctly installed.")
6. Multimodal (Image-Text) Feature Extraction
try:
    # Create a dummy image for this snippet
    dummy_img_path_mm_gs = "temp_dummy_image_mm_gs.png"
    try:
        img_mm_gs = Image.new('RGB', (224, 224), color='lightcoral')
        d_mm_gs = ImageDraw.Draw(img_mm_gs)
        d_mm_gs.text((10, 10), "Multimodal", fill=(0, 0, 0))
        img_mm_gs.save(dummy_img_path_mm_gs)

        clip_extractor = DLFeatExtractor(model_name="clip_vit_b32", task_type="multimodal_image_text")
        print("Successfully initialized clip_vit_b32 extractor for multimodal image-text.")

        multimodal_data_gs = [
            (dummy_img_path_mm_gs, "A light coral square with text."),
            (dummy_img_path_mm_gs, "Another description of the same image example.")
        ]
        multimodal_features_gs = clip_extractor.transform(multimodal_data_gs)
        print(f"CLIP Image features shape: {multimodal_features_gs['image_features'].shape}")
        # Example output for clip_vit_b32 with 2 image-text pairs: (2, 512)
        print(f"CLIP Text features shape: {multimodal_features_gs['text_features'].shape}")
        # Example output: (2, 512)
    except ImportError:
        print("Pillow (for ImageDraw) not found, skipping image creation part of multimodal snippet.")
    except Exception as e_mm_imgtxt:
        print(f"Error in multimodal (image-text) snippet: {e_mm_imgtxt}")
    finally:
        if os.path.exists(dummy_img_path_mm_gs):
            os.remove(dummy_img_path_mm_gs)
except Exception as e_init_mm_imgtxt:
    print(f"Could not initialize multimodal (image-text) extractor: {e_init_mm_imgtxt}")
    print("Ensure dependencies for multimodal models (like transformers) are installed.")
Next Steps
Explore the Model Zoo in the full documentation for a list of all available models, their capabilities, and source libraries.
Refer to the API Reference for detailed information on DLFeatExtractor methods and other utilities.
Run the self-tests using from DLFeat import run_self_tests; run_self_tests() to check model availability in your environment.