Majestrino 0.11 Alpha

Majestrino 0.11 Alpha is a contrastive audio-text model accompanied by a suite of downstream MLPs for audio quality assessment, speaking speed estimation, and synthetic voice detection.

Base Architecture (CLAP)

The base model is a CLAP (Contrastive Language-Audio Pretraining) architecture designed to map audio and text into a shared latent space (768-dim).

  • Audio Encoder: laion/BUD-E-Whisper (a Whisper encoder pre-trained for captioning emotional speech).
  • Text Encoder: Frozen Alibaba-NLP/gte-base-en-v1.5.
  • Projection Head: Linear -> GELU -> Linear (768 dimensions).
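
To illustrate how the shared space can be used, here is a minimal sketch of encoding a text query and comparing it against an audio embedding. It assumes CLS pooling and no text-side projection (gte-base-en-v1.5 already outputs 768-dim vectors); these details are illustrative assumptions, not taken from the released code.

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Assumption: the frozen text encoder maps directly into the shared 768-dim space.
tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-base-en-v1.5")
text_encoder = AutoModel.from_pretrained(
    "Alibaba-NLP/gte-base-en-v1.5", trust_remote_code=True
).eval()

def encode_text(texts):
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        out = text_encoder(**batch).last_hidden_state[:, 0]  # CLS pooling
    return F.normalize(out, p=2, dim=1)

# Cosine similarity against an audio embedding from encode_audio (see Usage):
# sims = audio_emb @ encode_text(["an angry shout", "calm narration"]).T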

Training Data & Strategy

  • Dataset: ~7 million audio-text pairs covering a wide distribution of emotions.
  • Languages: Primarily English, German, French, and Spanish.
  • Pre-training: Contrastive loss training for 3 epochs (see the loss sketch after this list).
  • Instruction Tuning (Epoch 4): Fine-tuned on a subset of several hundred thousand samples to understand user intent:
    • 50% Paraphrased Tags (Keyword search style).
    • 40% Natural Language Instructions (Voice acting prompts).
    • 10% Original Captions.
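
The pre-training objective is not spelled out above; the following is a minimal sketch of a standard CLIP/CLAP-style symmetric contrastive loss over a batch of paired audio/text embeddings, assuming a learnable temperature (logit_scale). It illustrates the idea rather than the exact training code.

import torch
import torch.nn.functional as F

def contrastive_loss(audio_emb, text_emb, logit_scale):
    # audio_emb, text_emb: [batch, 768], L2-normalized
    logits = logit_scale * audio_emb @ text_emb.T  # [batch, batch] similarity matrix
    targets = torch.arange(audio_emb.size(0), device=audio_emb.device)
    # Symmetric cross-entropy: each audio matches its caption and vice versa
    return (F.cross_entropy(logits, targets) +
            F.cross_entropy(logits.T, targets)) / 2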

Downstream Models (MLPs)

We trained several Multi-Layer Perceptrons (MLPs) on top of the frozen Majestrino CLAP embeddings to perform specific analysis tasks.

1. Audio Quality & Aesthetics

Distilled from existing scoring models into the Majestrino embedding space.

  • Content Enjoyment: Distilled from Meta Audio Aesthetics. Works cross-lingually (verified EN/DE).
  • MOS Scores: Distilled from Microsoft DNS MOS. Includes:
    • score_overall_quality
    • score_speech_quality
    • score_background_quality

2. Speaking Speed (CPS)

  • Metric: Characters Per Second (CPS).
  • Function: Estimates talking speed directly from the audio embedding.
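
For reference, the CPS target is presumably derived from transcript length and audio duration; the helper below is a hedged sketch of that definition, not part of the released code.

import torchaudio

def reference_cps(transcript: str, audio_path: str) -> float:
    # Characters per second = transcript length / audio duration in seconds
    info = torchaudio.info(audio_path)
    return len(transcript) / (info.num_frames / info.sample_rate)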

3. Real vs. AI Classifier

A binary classifier trained to distinguish between human speech and synthetic generation.

  • Training Data: 110k samples of AI-generated speech (11Labs, GPT-4o Audio, Gemini 2.5 Pro TTS) and 110k samples of real human speech.
  • Robustness Strategy:
    • Real human snippets were processed with Resemble Enhance and then run through a Chatterbox-based voice conversion from the original speaker back to the same speaker (self voice conversion). This removes "in-the-wild" background atmosphere and introduces digital artifacts into the "real" samples, preventing the model from simply detecting audio compression/quality artifacts.
    • Augmentations: 50% of samples (both Real and AI) were overlaid with background music, vintage effects, echo, or phase distortion.
  • Goal: Forces the model to learn intrinsic voice generation characteristics rather than background noise or codec artifacts.
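
The exact augmentation pipeline is not released; the function below sketches one plausible form of the music-overlay augmentation (mixing a background track at a random signal-to-noise ratio). The SNR formulation and gain range are assumptions for illustration.

import random
import torch

def overlay_music(speech: torch.Tensor, music: torch.Tensor,
                  snr_db_range=(5.0, 20.0)) -> torch.Tensor:
    # speech, music: mono waveforms at the same sample rate;
    # assumes music is at least as long as speech.
    music = music[..., : speech.shape[-1]]
    snr_db = random.uniform(*snr_db_range)
    # Scale the music so the speech-to-music power ratio matches the sampled SNR.
    speech_power = speech.pow(2).mean()
    music_power = music.pow(2).mean().clamp_min(1e-8)
    gain = torch.sqrt(speech_power / (music_power * 10 ** (snr_db / 10)))
    return speech + gain * music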

Usage

1. Setup

pip install torch torchaudio transformers safetensors huggingface_hub numpy

2. Model Definition

You need the wrapper classes to load the weights.

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import WhisperModel, AutoModel

class MajestrinoCLAP(nn.Module):
    def __init__(self):
        super().__init__()
        # Audio Encoder
        self.whisper = WhisperModel.from_pretrained("laion/BUD-E-Whisper")
        self.audio_encoder = self.whisper.encoder
        
        # Projection Head
        input_dim = self.whisper.config.d_model # 1024 usually
        self.projector = nn.Sequential(
            nn.Linear(input_dim, 2048),
            nn.GELU(),
            nn.Linear(2048, 768)
        )
        
    def encode_audio(self, features):
        # features shape: [batch, 80, 3000]
        out = self.audio_encoder(features).last_hidden_state.mean(dim=1)
        return F.normalize(self.projector(out), p=2, dim=1)

class PredictorMLP(nn.Module):
    """Standard MLP for Quality/Real-AI tasks"""
    def __init__(self, input_dim=768, output_dim=1, sigmoid=True):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256, 256), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(128, output_dim)
        )
        self.sigmoid = sigmoid
        self.output_act = nn.Sigmoid() if sigmoid else nn.Identity()

    def forward(self, x):
        return self.output_act(self.net(x))

class CPSPredictor(nn.Module):
    """Architecture for Speaking Speed"""
    def __init__(self, input_dim=768):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512), nn.LayerNorm(512), nn.GELU(), nn.Dropout(0.1),
            nn.Linear(512, 256), nn.LayerNorm(256), nn.GELU(), nn.Dropout(0.1),
            nn.Linear(256, 1) # Raw regression output
        )
    def forward(self, x):
        return self.net(x)

3. Inference Example

import torch
import torchaudio
from safetensors.torch import load_file
from transformers import WhisperFeatureExtractor
from huggingface_hub import hf_hub_download

# Config
REPO_ID = "ChristophSchuhmann/Majestrino_0.11_alpha"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load Base CLAP
print("Loading CLAP...")
clap_model = MajestrinoCLAP().to(DEVICE).eval()
# Load safetensors from Hub
clap_path = hf_hub_download(REPO_ID, "model.safetensors")
clap_model.load_state_dict(load_file(clap_path), strict=False)

# Audio Processor
processor = WhisperFeatureExtractor.from_pretrained("laion/BUD-E-Whisper")

# 2. Load Downstream Heads
heads = {}

# A. Real vs AI (Binary)
path = hf_hub_download(REPO_ID, "best_real_vs_ai_model.pt")
heads['real_vs_ai'] = PredictorMLP(sigmoid=True).to(DEVICE).eval()
heads['real_vs_ai'].load_state_dict(torch.load(path, map_location=DEVICE))

# B. CPS (Speed)
path = hf_hub_download(REPO_ID, "best_cps_model_checkpoint.pt")
heads['cps'] = CPSPredictor().to(DEVICE).eval()
heads['cps'].load_state_dict(torch.load(path, map_location=DEVICE))

# C. Quality Scores
score_files = [
    "mlp_score_content_enjoyment.pt",
    "mlp_score_overall_quality.pt",
    "mlp_score_speech_quality.pt",
    "mlp_score_background_quality.pt"
]

for s_file in score_files:
    key = s_file.replace("mlp_", "").replace(".pt", "")
    path = hf_hub_download(REPO_ID, s_file)
    heads[key] = PredictorMLP(sigmoid=True).to(DEVICE).eval()
    heads[key].load_state_dict(torch.load(path, map_location=DEVICE))

# 3. Process Audio
def process_audio(audio_path):
    # Load and resample
    wav, sr = torchaudio.load(audio_path)
    if sr != 16000:
        wav = torchaudio.transforms.Resample(sr, 16000)(wav)
    if wav.shape[0] > 1:
        wav = wav.mean(dim=0, keepdim=True)
    
    # Extract Mel specs
    inputs = processor(wav.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
    input_features = inputs.input_features.to(DEVICE)
    
    # Get Embedding
    with torch.no_grad():
        embedding = clap_model.encode_audio(input_features)
        
    return embedding

# 4. Run Inference
audio_file = "test_audio.wav"  # Replace with your file
emb = process_audio(audio_file)

results = {}
with torch.no_grad():
    # Real vs AI
    is_real_prob = heads['real_vs_ai'](emb).item()
    results['is_real_prob'] = is_real_prob
    results['classification'] = "Real" if is_real_prob > 0.5 else "AI"
    
    # CPS
    results['cps'] = heads['cps'](emb).item()
    
    # Scores (Normalize 0-1 sigmoid to 1-5 or 1-10)
    # Enjoyment is typically 1-10, others 1-5
    for key, head in heads.items():
        if "score_" in key:
            raw = head(emb).item()
            if "content_enjoyment" in key:
                results[key] = raw * 9 + 1 # Scale 1-10
            else:
                results[key] = raw * 4 + 1 # Scale 1-5

print("-" * 30)
print(f"File: {audio_file}")
print(f"Real Probability: {results['is_real_prob']:.4f} ({results['classification']})")
print(f"Speaking Speed:   {results['cps']:.2f} chars/sec")
print(f"Content Enjoyment: {results.get('score_content_enjoyment', 0):.2f} / 10")
print(f"Overall Quality:   {results.get('score_overall_quality', 0):.2f} / 5")
print("-" * 30)