HuPER: A Human-Inspired Framework for Phonetic Perception
Paper: arXiv:2602.01634 (published)
A CTC phone recognizer fine-tuned from WavLM-Large that maps 16 kHz speech audio to an ARPAbet phone sequence. See the HuPER paper for details: arXiv:2602.01634.
pip install -U transformers torchaudio
"""Example: transcribe a WAV file into ARPAbet phones with the HuPER recognizer.

Loads the fine-tuned WavLM CTC model from the Hugging Face Hub, runs greedy
CTC decoding on ``sample.wav``, and prints the space-joined phone sequence.
"""
import torch
import torchaudio
from transformers import Wav2Vec2Processor, WavLMForCTC

# Special vocabulary entries that are not phones and must not appear in output.
NON_PHONE_TOKENS = frozenset({"<PAD>", "<UNK>", "<BOS>", "<EOS>", "|"})


def ctc_collapse(pred_ids, blank_id, id_to_token):
    """Collapse frame-level greedy CTC predictions into a phone sequence.

    Standard CTC post-processing: merge consecutive repeated ids, drop the
    blank id, then filter out special (non-phone) tokens.

    Args:
        pred_ids: iterable of int token ids, one per acoustic frame.
        blank_id: id of the CTC blank token (the tokenizer's pad token here).
        id_to_token: callable mapping a token id to its string label.

    Returns:
        list[str]: the decoded phone tokens, in order.
    """
    phones = []
    prev = None
    for token_id in pred_ids:
        if token_id != blank_id and token_id != prev:
            token = id_to_token(token_id)
            if token not in NON_PHONE_TOKENS:
                phones.append(token)
        # Track the previous frame id unconditionally so that a blank frame
        # correctly separates two genuine repetitions of the same phone.
        prev = token_id
    return phones


if __name__ == "__main__":
    repo_id = "huper29/huper_recognizer"
    processor = Wav2Vec2Processor.from_pretrained(repo_id)
    model = WavLMForCTC.from_pretrained(repo_id)
    model.eval()

    # Load audio; downmix multi-channel to mono and resample to the
    # model's expected 16 kHz rate.
    waveform, sr = torchaudio.load("sample.wav")
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

    inputs = processor(
        waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt"
    )
    with torch.no_grad():
        logits = model(**inputs).logits
    pred_ids = torch.argmax(logits, dim=-1)[0].tolist()

    # The tokenizer's pad token doubles as the CTC blank.
    blank_id = processor.tokenizer.pad_token_id

    def id_to_token(token_id):
        # Prefer the model's label map; fall back to the tokenizer vocab.
        return model.config.id2label.get(
            token_id, processor.tokenizer.convert_ids_to_tokens(token_id)
        )

    print(" ".join(ctc_collapse(pred_ids, blank_id, id_to_token)))
@article{guo2026huper,
title = {HuPER: A Human-Inspired Framework for Phonetic Perception},
author = {Guo, Chenxu and Lian, Jiachen and Liu, Yisi and Huang, Baihe and Narayanan, Shriyaa and Cho, Cheol Jun and Anumanchipalli, Gopala},
journal = {arXiv preprint arXiv:2602.01634},
year = {2026}
}
Base model
microsoft/wavlm-large