add: app:get-audio
Script command to crawl RSS feed and find uploaded recordings to update the notes
This commit is contained in:
282
src/Command/GetAudioCommand.php
Normal file
282
src/Command/GetAudioCommand.php
Normal file
@@ -0,0 +1,282 @@
|
||||
<?php
|
||||
|
||||
namespace App\Command;
|
||||
|
||||
use App\Entity\Note;
|
||||
use Doctrine\ORM\EntityManagerInterface;
|
||||
use Symfony\Component\Console\Attribute\AsCommand;
|
||||
use Symfony\Component\Console\Command\Command;
|
||||
use Symfony\Component\Console\Input\InputInterface;
|
||||
use Symfony\Component\Console\Input\InputOption;
|
||||
use Symfony\Component\Console\Output\OutputInterface;
|
||||
use Symfony\Component\Console\Style\SymfonyStyle;
|
||||
use Symfony\Contracts\HttpClient\HttpClientInterface;
|
||||
|
||||
#[AsCommand(
|
||||
name: 'app:get-audio',
|
||||
description: 'Finds Notes with missing recordings and matches them to RSS feed by Date and Title.',
|
||||
)]
|
||||
class GetAudioCommand extends Command
|
||||
{
|
||||
/**
 * Stores the services this command works with.
 *
 * @param EntityManagerInterface $entityManager Gives access to the Note repository and flushes updated notes.
 * @param HttpClientInterface    $httpClient    Downloads the RSS feed pages.
 */
public function __construct(
    private EntityManagerInterface $entityManager,
    private HttpClientInterface $httpClient,
) {
    parent::__construct();
}
|
||||
|
||||
/**
 * Registers the command-line options of this command.
 */
protected function configure(): void
{
    // Progress output is always verbose for now, so no separate --debug flag is declared.
    $this->addOption(
        'dry-run',
        null,
        InputOption::VALUE_NONE,
        'No DB changes.',
    );
}
|
||||
|
||||
/** Minimum confidence (in percent) an RSS item must reach to be accepted as a match. */
private const MIN_CONFIDENCE = 80;

/**
 * Fills in missing recordings on notes by matching them against their user's RSS feed.
 *
 * Notes whose recording is NULL or empty (and whose user has an RSS URL) are loaded,
 * grouped per user so each feed is downloaded once, and compared with the feed items
 * published on the same day. The best-scoring item at or above MIN_CONFIDENCE wins and
 * its enclosure URL is stored as the note's recording. With --dry-run nothing is written.
 *
 * A failure while processing one user is reported and does not stop the remaining users.
 *
 * @param InputInterface  $input  Console input; the "dry-run" option is read from it.
 * @param OutputInterface $output Console output used for progress reporting.
 *
 * @return int Always Command::SUCCESS.
 */
protected function execute(InputInterface $input, OutputInterface $output): int
{
    $io = new SymfonyStyle($input, $output);
    $isDryRun = (bool) $input->getOption('dry-run');

    $io->title("Starting Audio Matcher");

    // 1. Fetch notes that still lack a recording, newest first, together with their user.
    $notesMissingAudio = $this->entityManager
        ->getRepository(Note::class)
        ->createQueryBuilder('n')
        ->leftJoin('n.user', 'u')
        ->addSelect('u')
        ->where('n.recording IS NULL OR n.recording = :empty')
        ->andWhere('u.homeChurchRSS IS NOT NULL')
        ->orderBy('n.date', 'DESC')
        ->setParameter('empty', '')
        ->getQuery()
        ->getResult();

    $count = count($notesMissingAudio);
    $io->text("Found $count notes in database missing audio.");

    if ($count === 0) {
        return Command::SUCCESS;
    }

    // 2. Group by user so every feed is fetched only once.
    $notesByUser = [];
    foreach ($notesMissingAudio as $note) {
        $owner = $note->getUser();
        $ownerId = (string) $owner->getId();
        $notesByUser[$ownerId]['user'] = $owner;
        $notesByUser[$ownerId]['notes'][] = $note;
    }

    // 3. Process each user's notes against that user's feed.
    foreach ($notesByUser as ['user' => $user, 'notes' => $userNotes]) {
        $rssUrl = $user->getHomeChurchRSS();

        $io->section("User: {$user->getEmail()} (Notes: " . count($userNotes) . ")");
        $io->text("Fetching RSS: $rssUrl");

        try {
            $rssItems = $this->fetchRssItems($rssUrl, $io);

            if ($rssItems === []) {
                $io->warning("RSS feed was empty or failed to parse.");
                continue;
            }

            $matchCount = 0;

            foreach ($userNotes as $note) {
                if (!$note->getDate()) {
                    $io->text(" > Note ID {$note->getId()} skipped (No Date)");
                    continue;
                }

                $noteDateString = $note->getDate()->format('Y-m-d');
                $noteTitle = $note->getTitle();
                $io->text("---------------------------------------------------");
                $io->text("Checking Note: [$noteDateString] '$noteTitle'");

                $bestMatch = $this->findBestMatch($note, $noteDateString, $rssItems, $io);

                if ($bestMatch === null) {
                    $io->text(" > No match found for this note.");
                    continue;
                }

                $matchCount++;
                $io->success("Match Found! ({$bestMatch['confidence']}%)");

                if (!$isDryRun) {
                    $note->setRecording($bestMatch['url']);
                }
            }

            if (!$isDryRun) {
                $this->entityManager->flush();
            }

            if ($matchCount > 0) {
                $io->success("Found $matchCount matches");
            }
        } catch (\Exception $e) {
            // Report and move on: one broken feed must not block the other users.
            $io->error("Error: " . $e->getMessage());
        }
    }

    return Command::SUCCESS;
}

/**
 * Picks the RSS item that best matches a note among the items published on the note's date.
 *
 * @param Note         $note           The note being matched.
 * @param string       $noteDateString The note's date formatted as Y-m-d.
 * @param array        $rssItems       Items as returned by fetchRssItems()
 *                                     (keys: title, speaker, url, date_string).
 * @param SymfonyStyle $io             Receives one progress line per same-day candidate.
 *
 * @return array|null The winning item with an added "confidence" key, or null when no
 *                    candidate reaches MIN_CONFIDENCE.
 */
private function findBestMatch(Note $note, string $noteDateString, array $rssItems, SymfonyStyle $io): ?array
{
    $bestMatch = null;
    $highestConfidence = 0;

    foreach ($rssItems as $item) {
        if ($item['date_string'] !== $noteDateString) {
            continue;
        }

        // An item without an enclosure URL carries no audio. Accepting it would store an
        // empty recording, report a false success, and leave the note selected on every run.
        if (trim((string) ($item['url'] ?? '')) === '') {
            $io->text(sprintf(" - SKIPPED: No audio URL. RSS Title: '%s'", $item['title']));
            continue;
        }

        $confidence = $this->calculateConfidence($note, $item);
        $io->text(sprintf(
            " - DATE MATCHED. Score: %d%%. RSS Title: '%s'",
            $confidence,
            $item['title']
        ));

        if ($confidence >= self::MIN_CONFIDENCE && $confidence > $highestConfidence) {
            $highestConfidence = $confidence;
            $bestMatch = ['confidence' => $confidence] + $item;
        }
    }

    return $bestMatch;
}
|
||||
|
||||
/**
 * Downloads an RSS feed, follows its pagination links, and returns all items found.
 *
 * Pages are fetched iteratively: after each page the channel is searched for a
 * rel="next" link (RFC 5005 style). Fetching stops when there is no next link, when the
 * next link points at a page that was already fetched, after 20 pages, or on the first
 * HTTP/XML failure. Items collected before a failure are still returned.
 *
 * @param string       $startUrl URL of the first feed page.
 * @param SymfonyStyle $io       Receives progress and warning output.
 *
 * @return array<int, array{title: string, speaker: string, url: string, date_string: ?string}>
 */
private function fetchRssItems(string $startUrl, SymfonyStyle $io): array
{
    $items = [];
    $visitedUrls = [];
    $nextUrl = $startUrl;
    $pageCount = 0;
    $maxPages = 20; // Safety brake to prevent infinite loops

    do {
        $pageCount++;
        $visitedUrls[$nextUrl] = true;
        $io->text(" > Fetching Feed Page $pageCount: $nextUrl");

        try {
            $content = $this->httpClient->request('GET', $nextUrl)->getContent();
        } catch (\Exception $e) {
            $io->warning("HTTP Request Failed on page $pageCount: " . $e->getMessage());
            break;
        }

        // Collect libxml errors internally instead of silencing warnings with "@",
        // then restore whatever setting the rest of the application was using.
        $previousLibxmlSetting = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($content);
        libxml_clear_errors();
        libxml_use_internal_errors($previousLibxmlSetting);

        if ($xml === false) {
            $io->warning("XML Parsing Failed on page $pageCount");
            break;
        }

        // Well-formed XML that is not RSS (e.g. an Atom feed or an error document) has
        // no <channel>; calling methods on that missing node would only raise warnings.
        if (!isset($xml->channel)) {
            $io->warning("No <channel> element found on page $pageCount");
            break;
        }
        $channel = $xml->channel;

        // 1. Parse items on this page
        $pageItemsCount = 0;
        foreach ($channel->item as $item) {
            $items[] = $this->parseRssItem($item);
            $pageItemsCount++;
        }

        $io->text(" Found $pageItemsCount items on this page.");

        // 2. Look for a "next page" link
        $nextUrl = $this->findNextPageUrl($channel);

        // A feed whose "next" link points back at a fetched page would otherwise be
        // downloaded again and again until the page limit, duplicating every item.
        if ($nextUrl !== null && isset($visitedUrls[$nextUrl])) {
            $io->warning("Next page link points to an already fetched page; stopping pagination.");
            $nextUrl = null;
        }
    } while ($nextUrl !== null && $pageCount < $maxPages);

    $io->success(sprintf("Finished fetching. Total items: %d (across %d pages)", count($items), $pageCount));

    return $items;
}

/**
 * Converts one <item> element into the flat array used for matching.
 *
 * The speaker is taken from itunes:author, then dc:creator, then the plain <author>
 * element, using the first one that is not blank.
 *
 * @param \SimpleXMLElement $item An <item> element of an RSS channel.
 *
 * @return array{title: string, speaker: string, url: string, date_string: ?string}
 */
private function parseRssItem(\SimpleXMLElement $item): array
{
    $namespaces = $item->getNamespaces(true);
    $speaker = '';

    if (isset($namespaces['itunes'])) {
        $itunes = $item->children($namespaces['itunes']);
        $speaker = trim((string) ($itunes->author ?? ''));
    }
    if ($speaker === '' && isset($namespaces['dc'])) {
        $dc = $item->children($namespaces['dc']);
        $speaker = trim((string) ($dc->creator ?? ''));
    }
    if ($speaker === '') {
        $speaker = trim((string) ($item->author ?? ''));
    }

    // An empty string is a valid DateTimeImmutable argument meaning "now", so an empty
    // <pubDate/> must be rejected explicitly or the item would be dated today.
    $dateString = null;
    $rawDate = trim((string) ($item->pubDate ?? ''));
    if ($rawDate !== '') {
        try {
            $dateString = (new \DateTimeImmutable($rawDate))->format('Y-m-d');
        } catch (\Exception) {
            // Unparseable date: keep null so the item never matches any note date.
        }
    }

    return [
        'title' => (string) $item->title,
        'speaker' => $speaker,
        'url' => (string) ($item->enclosure['url'] ?? ''),
        'date_string' => $dateString,
    ];
}

/**
 * Finds the URL of the next feed page from the channel's rel="next" links.
 *
 * Links in the namespace bound to the "atom" prefix are checked first, then plain
 * <link> elements as a fallback for feeds that omit the namespace.
 *
 * @param \SimpleXMLElement $channel The <channel> element of the current page.
 *
 * @return string|null The href of the first rel="next" link with a non-blank href, or null.
 */
private function findNextPageUrl(\SimpleXMLElement $channel): ?string
{
    $linkSets = [];

    $namespaces = $channel->getNamespaces(true);
    if (isset($namespaces['atom'])) {
        $linkSets[] = $channel->children($namespaces['atom'])->link;
    }
    $linkSets[] = $channel->link;

    foreach ($linkSets as $links) {
        foreach ($links as $link) {
            // rel/href are un-namespaced attributes, hence attributes() without arguments.
            $attributes = $link->attributes();
            if ((string) ($attributes['rel'] ?? '') !== 'next') {
                continue;
            }

            $href = trim((string) ($attributes['href'] ?? ''));
            if ($href !== '') {
                return $href;
            }
        }
    }

    return null;
}
|
||||
|
||||
/**
 * Scores how closely an RSS item matches a note, as a similarity percentage.
 *
 * Titles are always compared with similar_text(). When both the note and the RSS item
 * have a non-empty speaker, the result is the average of the title and speaker
 * similarities; otherwise it is the title similarity alone.
 *
 * @param Note  $note    The note to compare.
 * @param array $rssItem Feed item with at least the "title" and "speaker" keys.
 *
 * @return float Similarity between 0 and 100.
 */
private function calculateConfidence(Note $note, array $rssItem): float
{
    // Casts guard against a null title, which normalize(string) would reject with a TypeError.
    $noteTitle = $this->normalize((string) $note->getTitle());
    $rssTitle = $this->normalize((string) ($rssItem['title'] ?? ''));

    // Null-safe call: a note without a speaker must not abort the whole command with an
    // Error (the caller only catches \Exception). NOTE(review): assumes getSpeaker() can
    // return null; harmless if it cannot.
    $noteSpeaker = $this->normalize((string) ($note->getSpeaker()?->getName() ?? ''));
    $rssSpeaker = $this->normalize((string) ($rssItem['speaker'] ?? ''));

    $titlePercent = 0.0;
    similar_text($noteTitle, $rssTitle, $titlePercent);

    // Without a speaker on both sides there is nothing to compare; rely on the title only.
    if ($noteSpeaker === '' || $rssSpeaker === '') {
        return $titlePercent;
    }

    $speakerPercent = 0.0;
    similar_text($noteSpeaker, $rssSpeaker, $speakerPercent);

    return ($titlePercent + $speakerPercent) / 2;
}
|
||||
|
||||
/**
 * Prepares text for fuzzy comparison by trimming surrounding whitespace and lower-casing it.
 *
 * @param string $input Raw title or speaker name (treated as UTF-8).
 *
 * @return string The trimmed, lower-cased text.
 */
private function normalize(string $input): string
{
    // strtolower() works byte-wise and leaves non-ASCII letters untouched, so "É" and "é"
    // would count as different characters; mb_strtolower() folds them as well.
    return mb_strtolower(trim($input), 'UTF-8');
}
|
||||
}
|
||||
Reference in New Issue
Block a user