add: app:get-audio

Script command to crawl RSS feed and find uploaded recordings to update the notes
This commit is contained in:
2026-01-16 13:54:45 -05:00
parent 50cf4800fd
commit b14a0c23f6

View File

@@ -0,0 +1,282 @@
<?php
namespace App\Command;
use App\Entity\Note;
use Doctrine\ORM\EntityManagerInterface;
use Symfony\Component\Console\Attribute\AsCommand;
use Symfony\Component\Console\Command\Command;
use Symfony\Component\Console\Input\InputInterface;
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;
use Symfony\Component\Console\Style\SymfonyStyle;
use Symfony\Contracts\HttpClient\HttpClientInterface;
#[AsCommand(
    name: 'app:get-audio',
    description: 'Finds Notes with missing recordings and matches them to RSS feed by Date and Title.',
)]
/**
 * Console command that fills in the `recording` of Notes that have none.
 *
 * For every user that has a home-church RSS feed configured, the feed is
 * downloaded (following "next" pagination links), and each of the user's notes
 * without a recording is compared against the feed items published on the same
 * calendar day. The item with the highest title/speaker similarity that reaches
 * MIN_CONFIDENCE is stored as the note's recording URL.
 */
class GetAudioCommand extends Command
{
    /** Minimum similarity (in percent) an RSS item must reach to be accepted as a match. */
    private const MIN_CONFIDENCE = 80;

    /** Upper bound on feed pages followed per feed; safety brake against endless pagination. */
    private const MAX_FEED_PAGES = 20;

    public function __construct(
        private EntityManagerInterface $entityManager,
        private HttpClientInterface $httpClient
    ) {
        parent::__construct();
    }

    /**
     * Declares the `--dry-run` option. There is no dedicated debug flag: the
     * command always prints its verbose matching log.
     */
    protected function configure(): void
    {
        $this->addOption('dry-run', null, InputOption::VALUE_NONE, 'No DB changes.');
    }

    /**
     * Runs the matcher for every user that has notes without a recording.
     *
     * @return int Command::SUCCESS when every user was processed without an
     *             exception, Command::FAILURE when at least one user's
     *             processing threw (the remaining users are still attempted).
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $io = new SymfonyStyle($input, $output);
        $isDryRun = (bool) $input->getOption('dry-run');

        $io->title("Starting Audio Matcher");

        // 1. Fetch Notes
        $notesMissingAudio = $this->findNotesMissingAudio();
        $count = count($notesMissingAudio);
        $io->text("Found $count notes in database missing audio.");
        if ($count === 0) {
            return Command::SUCCESS;
        }

        $hadError = false;

        // 2. Group by User, 3. Process Per User
        foreach ($this->groupNotesByUser($notesMissingAudio) as $group) {
            $user = $group['user'];
            $userNotes = $group['notes'];
            $rssUrl = $user->getHomeChurchRSS();

            $io->section("User: {$user->getEmail()} (Notes: " . count($userNotes) . ")");
            $io->text("Fetching RSS: $rssUrl");

            try {
                $rssItems = $this->fetchRssItems($rssUrl, $io);
                if ($rssItems === []) {
                    $io->warning("RSS feed was empty or failed to parse.");
                    continue;
                }

                $matchCount = $this->matchNotesToFeed($userNotes, $rssItems, $isDryRun, $io);

                // Nothing was modified when there are no matches, so skip the flush.
                if (!$isDryRun && $matchCount > 0) {
                    $this->entityManager->flush();
                }
                if ($matchCount > 0) {
                    $io->success("Found $matchCount matches");
                }
            } catch (\Exception $e) {
                // Keep going with the next user, but make the failure visible in the exit code.
                $hadError = true;
                $io->error("Error: " . $e->getMessage());
            }
        }

        return $hadError ? Command::FAILURE : Command::SUCCESS;
    }

    /**
     * Loads all notes whose recording is NULL or empty and whose user has an
     * RSS feed URL, newest first. The user is fetch-joined to avoid one query
     * per note later on.
     *
     * @return list<Note>
     */
    private function findNotesMissingAudio(): array
    {
        return $this->entityManager->getRepository(Note::class)
            ->createQueryBuilder('n')
            ->leftJoin('n.user', 'u')
            ->addSelect('u')
            ->where('n.recording IS NULL OR n.recording = :empty')
            ->andWhere('u.homeChurchRSS IS NOT NULL')
            ->orderBy('n.date', 'DESC')
            ->setParameter('empty', '')
            ->getQuery()
            ->getResult();
    }

    /**
     * Buckets notes by the string form of their user's id so each feed is
     * downloaded once per user.
     *
     * @param list<Note> $notes
     *
     * @return array<string, array{user: object, notes: list<Note>}>
     */
    private function groupNotesByUser(array $notes): array
    {
        $grouped = [];
        foreach ($notes as $note) {
            $user = $note->getUser();
            $key = (string) $user->getId();
            $grouped[$key] ??= ['user' => $user, 'notes' => []];
            $grouped[$key]['notes'][] = $note;
        }

        return $grouped;
    }

    /**
     * Tries to find a recording for each note among the given feed items and,
     * unless this is a dry run, stores the matched URL on the note (without
     * flushing).
     *
     * @param list<Note> $notes
     * @param list<array{title: string, speaker: string, url: string, date_string: ?string}> $rssItems
     *
     * @return int Number of notes for which a match was found.
     */
    private function matchNotesToFeed(array $notes, array $rssItems, bool $isDryRun, SymfonyStyle $io): int
    {
        $matchCount = 0;

        foreach ($notes as $note) {
            if (!$note->getDate()) {
                $io->text(" > Note ID {$note->getId()} skipped (No Date)");
                continue;
            }

            $noteDateString = $note->getDate()->format('Y-m-d');
            $noteTitle = $note->getTitle();
            $io->text("---------------------------------------------------");
            $io->text("Checking Note: [$noteDateString] '$noteTitle'");

            $bestMatch = null;
            $highestConfidence = 0.0;

            foreach ($rssItems as $item) {
                // An item without an enclosure has no audio to attach; an item
                // from another day (or with an unparseable date) cannot match.
                if ($item['url'] === '' || $item['date_string'] !== $noteDateString) {
                    continue;
                }

                $confidence = $this->calculateConfidence($note, $item);
                $io->text(sprintf(
                    " - DATE MATCHED. Score: %d%%. RSS Title: '%s'",
                    $confidence,
                    $item['title']
                ));

                if ($confidence >= self::MIN_CONFIDENCE && $confidence > $highestConfidence) {
                    $highestConfidence = $confidence;
                    $bestMatch = $item;
                }
            }

            if ($bestMatch === null) {
                $io->text(" > No match found for this note.");
                continue;
            }

            $matchCount++;
            $io->success(sprintf('Match Found! (%d%%)', $highestConfidence));
            if (!$isDryRun) {
                $note->setRecording($bestMatch['url']);
            }
        }

        return $matchCount;
    }

    /**
     * Downloads an RSS feed and, iteratively, every following page advertised
     * through a rel="next" link (RFC 5005 style), up to MAX_FEED_PAGES pages.
     *
     * Fetching is best-effort: an HTTP or XML error on any page stops the
     * pagination and whatever was collected so far is returned.
     *
     * @return list<array{title: string, speaker: string, url: string, date_string: ?string}>
     */
    private function fetchRssItems(string $startUrl, SymfonyStyle $io): array
    {
        $items = [];
        $visited = [];
        $pageCount = 0;
        $nextUrl = $startUrl;

        while ($nextUrl !== null && $pageCount < self::MAX_FEED_PAGES) {
            // A feed whose "next" link points back to a page we already read
            // would otherwise be downloaded again and again.
            if (isset($visited[$nextUrl])) {
                $io->warning("Pagination loop detected, already fetched: $nextUrl");
                break;
            }
            $visited[$nextUrl] = true;

            $pageCount++;
            $io->text(" > Fetching Feed Page $pageCount: $nextUrl");

            try {
                // getContent() throws on transport errors and on 3xx/4xx/5xx responses.
                $content = $this->httpClient->request('GET', $nextUrl)->getContent();
            } catch (\Exception $e) {
                $io->warning("HTTP Request Failed on page $pageCount: " . $e->getMessage());
                break;
            }

            $xml = $this->parseXml($content);
            if ($xml === null) {
                $io->warning("XML Parsing Failed on page $pageCount");
                break;
            }
            if (!isset($xml->channel)) {
                $io->warning("No <channel> element found on page $pageCount");
                break;
            }

            // 1. Parse Items on this page
            $pageItemsCount = 0;
            foreach ($xml->channel->item as $item) {
                $items[] = $this->parseFeedItem($item);
                $pageItemsCount++;
            }
            $io->text(" Found $pageItemsCount items on this page.");

            // 2. Look for "Next Page" link (RFC 5005 / Atom)
            $nextUrl = $this->findNextPageUrl($xml->channel);
        }

        $io->success(sprintf("Finished fetching. Total items: %d (across %d pages)", count($items), $pageCount));

        return $items;
    }

    /**
     * Parses an XML document, returning null instead of emitting warnings when
     * the markup is malformed. The libxml error-handling mode is restored
     * afterwards so other code is not affected.
     */
    private function parseXml(string $content): ?\SimpleXMLElement
    {
        $previousMode = libxml_use_internal_errors(true);
        try {
            $xml = simplexml_load_string($content);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        return $xml === false ? null : $xml;
    }

    /**
     * Extracts the fields used for matching from one <item> element.
     *
     * The speaker is taken from the first non-empty of itunes:author,
     * dc:creator and the plain <author> element. `date_string` is the pubDate
     * formatted as Y-m-d, or null when pubDate is missing, blank or unparseable.
     *
     * @return array{title: string, speaker: string, url: string, date_string: ?string}
     */
    private function parseFeedItem(\SimpleXMLElement $item): array
    {
        $namespaces = $item->getNamespaces(true);

        $speaker = '';
        if (isset($namespaces['itunes'])) {
            $speaker = (string) ($item->children($namespaces['itunes'])->author ?? '');
        }
        if ($speaker === '' && isset($namespaces['dc'])) {
            $speaker = (string) ($item->children($namespaces['dc'])->creator ?? '');
        }
        if ($speaker === '') {
            $speaker = (string) ($item->author ?? '');
        }

        $dateString = null;
        $pubDate = trim((string) ($item->pubDate ?? ''));
        // An empty string must not reach the constructor: it would be read as "now".
        if ($pubDate !== '') {
            try {
                $dateString = (new \DateTimeImmutable($pubDate))->format('Y-m-d');
            } catch (\Exception $e) {
                // Unparseable date: leave null so the item never matches a note.
            }
        }

        return [
            'title' => (string) $item->title,
            'speaker' => $speaker,
            'url' => (string) ($item->enclosure['url'] ?? ''),
            'date_string' => $dateString,
        ];
    }

    /**
     * Returns the href of the channel's rel="next" link, or null when there is
     * none. <atom:link> elements are checked first, then un-namespaced <link>
     * elements as a fallback for feeds that omit the atom namespace.
     */
    private function findNextPageUrl(\SimpleXMLElement $channel): ?string
    {
        $namespaces = $channel->getNamespaces(true);
        if (isset($namespaces['atom'])) {
            $href = $this->findNextHref($channel->children($namespaces['atom'])->link);
            if ($href !== null) {
                return $href;
            }
        }

        return $this->findNextHref($channel->link);
    }

    /**
     * Scans link elements for the first one with rel="next" and returns its
     * non-empty href, or null.
     */
    private function findNextHref(\SimpleXMLElement $links): ?string
    {
        foreach ($links as $link) {
            $attributes = $link->attributes();
            if ((string) ($attributes['rel'] ?? '') !== 'next') {
                continue;
            }

            $href = trim((string) ($attributes['href'] ?? ''));

            return $href !== '' ? $href : null;
        }

        return null;
    }

    /**
     * Scores how well an RSS item matches a note, as a percentage (0-100).
     *
     * The score is the similar_text() percentage of the normalized titles;
     * when both the note and the item name a speaker, it is averaged with the
     * similarity of the speaker names.
     *
     * @param array{title: string, speaker: string, url: string, date_string: ?string} $rssItem
     */
    private function calculateConfidence(Note $note, array $rssItem): float
    {
        // Notes may lack a title or a speaker; treat both as empty text
        // instead of failing on null.
        $noteTitle = $this->normalize($note->getTitle() ?? '');
        $rssTitle = $this->normalize($rssItem['title']);
        $noteSpeaker = $this->normalize($note->getSpeaker()?->getName() ?? '');
        $rssSpeaker = $this->normalize($rssItem['speaker']);

        similar_text($noteTitle, $rssTitle, $titlePercent);

        if ($noteSpeaker !== '' && $rssSpeaker !== '') {
            similar_text($noteSpeaker, $rssSpeaker, $speakerPercent);

            return ($titlePercent + $speakerPercent) / 2;
        }

        return $titlePercent;
    }

    /**
     * Trims and lower-cases text for comparison, using multibyte-aware case
     * folding so accented characters compare case-insensitively too.
     */
    private function normalize(string $input): string
    {
        return mb_strtolower(trim($input));
    }
}