From 75d17f8e954a8f6bf10919a48979348e4b163a31 Mon Sep 17 00:00:00 2001 From: dadams Date: Sat, 16 May 2026 17:05:59 -0700 Subject: [PATCH] got the ingest for energy eia data. created txt files of their descriptions --- .../ingest_eia_energy_layers.cpython-314.pyc | Bin 39938 -> 46499 bytes ingest_eia_energy_layers.py | 701 +++++++++++++----- output/facility_fuel_pending_narrative.txt | 132 ++++ .../operating_generator_capacity_sample.txt | 134 ++++ 4 files changed, 787 insertions(+), 180 deletions(-) create mode 100644 output/facility_fuel_pending_narrative.txt create mode 100644 output/operating_generator_capacity_sample.txt diff --git a/__pycache__/ingest_eia_energy_layers.cpython-314.pyc b/__pycache__/ingest_eia_energy_layers.cpython-314.pyc index 0756fadf4d8b5fa0858f78dc6aa1096b9387f6a2..fdbcaa1e8c92c7c34ee34060f41899a4b7ed6ace 100644 GIT binary patch delta 14117 zcmbt*33O9emf-(i`(kaDWXqN;8F`bj4Q4S08yka7Fka$(96`XgB4l9{geAUY7Kb1c zDk&34aCjs*%QU7!RY*;vxLIRakfIcF23PnXtoSM|>)Kq6$EE%Urx@+!x zU-Cj|NOg<&{ri@C-+kMCcX@g8Ec4o*nZmiMG&u#of4eSu?ES+O^}lf@^%Et5o5jx; zK2_O8vEs(`HTtmLO0kl#ft7}htSp?t%EOtgB5djuwk=9wPJi2$pxjb@*&>=R3Y$C4 zg~{5UY>F~dot9jRO-t2VsZMcQHL~s0qZ0>|!cI|J4P*==6+kP{epPxJlO&Xzs;g6{ zki&CW4Xb5!Y&xqiqvtX6=&+sDvj)~!hT*(WV!-MET|s!ePM6n3&*kh<7Mq3XXg?U7Y9`vkjh9=(bV7j#Mu z6pSR2L9qjil1!%NBV5Sthw>*gD8D9L#2x_n2LyJo|CN0Ta!=!&pAHwZf6x99@`K6z zT=r+|LC8Oo%$Kl-Aa|JHma>s&A@dx`l(9zu9wP8O_P?>uL+)r2X+DcGsBk%Z4AKSc z3sCbSX}6F)4)8F67qKq^{4#+Tv#&tQ3ieef`7fko3Hx^dpCE80I|8*!*^?R6hVU}> zM^Nz^si_;ms=XHNq{4f_V9wVf;2pH^<5sBYMGu=cgNRBDc3Q_*(v z=lJ>P2BAQpbI@^>cemHm7xub#y8V6L5ZWY36o`U>U0yCO^|-^{o&dKet|SGnpx@ma z4#j2O-tJ()hsLF)ql;zNXnF(EDK61z=u^cD0QV_h6h8@Dh&zC^X{)k+2w({B%@y1r zE`23S!>+KHVT~c5pIB`kH-`C^YvWFWXcth%Dz=MA0<9@ja zPF}>x{krs_P7QQ9x?j^sONRl&A?8NVN7@oPg8r(_q6bly&Mf*FRQOPpt_M1KP4@?S zA9_07QH?urh`6V4Klto(Vw^~DB822#BjwPa)4QPCwR$T_WDS!g_yBS7TLUuHvQ=*?_Rz+|z z`OOpBF^gnUFzL8ryL-2b^LheYcZhovO2{-NM-dnJdc*us@BqkA?llU$+@V@X2B=AD 
zB~9mDC+W2XDs4i5^Pg+=2@wWviK$H6DdDMgKbC!*QJj#FGASjNU6)~yJA|&A^qF>w z65f=;c;--6pvlrfK^GMib|F*NX$2IUDVDKQP8oV9+kkFn)iErCDzdwzxU^jvlpM{@ zKp$rtX*K%i>|7VfO*6ILP(lPYFbTh)+NtiMDwn~5nj+U>;Oi*LsYppir@HbOw1M?B zLn+HR)if1UtpDXgW;Zn;vQkb_hXDoc|4vMl-9hwv~9l8{>kP#8sRmZZRO*2TVwsMp&=4ysi3*Jqa#c*1HN1Lc z&1s~uwg^5^>QJxMPQEW%?J~&6EX7f27K&O+BC$;S$C;&rjWKoRaNXg3pJA1QiW}SO_?deCN)+IyA2~}29Rr-;%G$vIaTR60LEIsdIY2Kh@LSY=%9NYIb2+7uzN@Sg!RI}lw!uNYXJ09X%mqh!L#^OKHG|cHirCN0B+V2xL@4?)&syHO3gl2^NwZrU*iV$k~BZ z0k-EkW8W%K-J6LJgv822(2u zQ!;`ObceU5I0I?JUkdk~^)Jz}0-m$=%adGv4ahUOQgL2)H_y}-pWn;fhWamY8$5Rd z)Py2!ppme(rIMecuy=QuaPkH-eOujY>2IKaDJ)D*PeoBpMUs(k?_|W|cZWi73%mU8 zJzkFcE7aeBKkonGoY^j&>C5k?|n|O`vubt_>x%fJt39AlJqC-=bs>cgahF zk}|l0;7{q4YN*w>;4(|=P$j#j+BDb(P&uazgCM}@Glw31IoATp9EPkV3V}LVT33=R zeg6N*BSB3znEj6Q6iMflozOzFVz@aq-)W&q4S=-J0FBk;#?*LM;xB7ySNAbh z47qPXB{5T78wtD+!!3-JYMIC8wC8K?X!HxxD0ixn#i?jBq0jW`g_`yMwos7j6s}=$ z7hQtaFZm0KTXaq(-`1g8NY7{~x}&A23HGRr%K7ekMdVIJYDv36H-I?-!;Akp<*X7f z9hT-sll?wX%g8K><~el z6}|b4c>}gPoWiN0N-!Rq;S@vPnF|E46-+_x46d>WSJ{{=rxtd-Xqv1O4qD|8s2Rl3 zyNxzruUUs4-IZ4jKn}0xfM%pmjkCj;f{12B6f8VvRdi4t)L3Ag%_n^nCcrfR34gXU z-%3%jQWjz}&CJ`Ax)k&QbHM*V<06|hsGBhobGvT#sRdf%psrSz4ZDb+oDw`aHrNX@ zyWt!8=#w9Mw~4M7*6nf9wviYb(Z1Gavh&DI;R z;mZLku!-pHzRi^2cDsCGdL8K;4c9-% z-~_KlT<8h(_QsX?0(HB#V9hoX0=z&UR$hTU7Y`~xhYkDOe$axux&MVbxQ4L>z#D0{ z-NglVfw7Gzjt9Xe#zt6Nj*CLB$3lT#PK3J^LlT!@(%VBl+=oCWgk|!SYOI`AkU>GR zEwV(i0fStK)v#Uz)aMGrq3VJ91hbwNX+O`HcQs?-m5haF9=N(_&6P!K#xfcQl@l86 zpeUv{49a50>_Nrn^J-$s^s7qS6{QXI-&juZq(nGho}eJPqMSR}JSnI2)=v$lqwO)1 zCn!-9ZAl2k7TYa>)RdLbQ6@9AQA7)> z&u%>19L;MS&s;lcpt1@l1(Y!>kx5zYLyC#aEYP@d^1G~pA@Ma!?um^rZ#*p?v&R0ne1g{pKZ68{7M>Zj)^chEMM?yb}oL%^i{kK(3ALy=D zHAUOnqZ>B;wrz9NwfSn>=9u1cRiAf7pEu$;y)c?zHLkCQq7iN6h1TaAC#EVN8|fU^ zFN|pn!`c_*M@84IRCeLy97<3HX{&JoY28MbSg1u?TVmQ#MF_>4Ka zd}GwSDQ3=(<*$ns6n~`wSRM~j55tR zwJe%Zc8j86rfqpqecsTjSL_J^WN%Lzfw$wG7(r(|uKy;{O4BlJT&ejcAwY?WM~rCTvHP*Ik&8MZK>kjJSo8Mmef&D@m{F|((@TQoEFb%G6*jy>lQ%f*EObQ zq2&FHyry!+`_+0#KhQ{_HC8Ca^MXJ7x*aMez-QuG4tr*`rpsR<^wN| 
zK~K1+x%v|7Y%l|7{SCjQttCLfA(+7hnGO(-&^LGx(Sr}z@dw6v z=ON)5+a(;ROZV*ZHT23FFq1~#Tk{dUX)kUpOJA2zAU&T!60x zAyi?f2XAj`c=HHfdoE&IaZ91)Yom#^PYPIF2vP9n_r+eTPJ1=5HI?Dy`vwXa}VMH{^`MTDSdM^d&6Oo0dq7xN^S|x zE=O;57G+M44-=1zd-x$3(6^oQvIcRHif{!}!gIzo0`do=4P9MK!3s?0ersHTe)A~i z=lY|6WN0b6>gk~mquTBqaMN~mSI{-1FLXa6TsbvS+;U8`9+J2eCYxN<exq;YeKsH z&cdW`?(_M-&n&#+=NvU3ia z*p`d_?_7d{Y$fCicIm{nInV#I6d9@tkZM;E(;Vj&=mq$LB2|hSW&hGu;CKjoZSG#T ze@`e}x`p$BQ^&u@wgr?!8^70VTe!e>+uC&xw$4y3%lgOAK))XKACaT~u_q0!d3-L4 zJf7{9PpdK*a`$%gREU4Tw%k_eg~QXsJHEYsuP4m;JihQA7gkx_VQ?C{dT`u2AW2#Q7?TStqAom`{wj>+oPouK1K>9A0UdF=Rea%$ z2H%&z{xx?pbg$76ZrpE!jfHYo8CH7u1t3$M(&cCHa9>I`H`=k@i2m9yM*9w%=pIzQ zJ@cqng064K275woD6Od}gt_$vx(jXYkc|h?jtyDUwK!_zOIBkT?O18T1_+>M5OiGF z3jQ6Q0%>w(MRu|Y=^+$`U6iCj;2AWGmx2U6x2OPC|g&28$_@2ni zmURcfDE5N5@_KwBIN%sb=JkiXw%$I!AC)^zDiCw{JS4lsLlb(ypNU&8y(@2l-k$Hp zT=`v@o5ub=+^JLCCDD>Iuh0O;oO_pR%2+37|HKgyI`W*V1P2qpugt=-C(?f?DSdps z-QL}{zeVhTmi(%WO}@rA;?32pf68rG`#Za_WMDl zC8|;c>#STQBypL0r`zYpj(lEyg{zxaU$wl3gKyB3>gfW6Ay7|4S^yw6Axk@weK7lo z<*;Sg9+N7LC=P=6K$NDrC6LJE2@O>=_w~Ie_nz7}rmr};Z$vPxJ?MmON;6& zCetZ-1{job725xRF%r%6jw`psl5Q@s-=i!NH9s=0**LkKrc~Kr-pW-6 z^?$h`q6|w2pGUw*&V-sH#P*e|j_3#Vw?lQHl)aQ!KS%Vow%#H=r(ys+H^KDk)FIF^&(k@WA{DM@`V3K)YE-4yK)>qC4bB~xv*j{;q z=5X}nUU8GFs3?q$q=xVbvG_G;eEUl{+{UAiU!`xO*yF1pHSb-sb!s1QozPWW4jYUM zhrlPdmDl&dnGg2Mx`&#{TaLKAu^V5n;2z_6^;*L3ndvQ~1PT!M;dR8gILP&ZtB+&Q zmAzPZPwcgcU>2#XqlPD5rO{_<0}>VFF&zxjq#MxOZHmY|i15SyB*7$oh{!Dn$q{ly zSq!BDZP=blhOF;16(+C-Lpjh0hmcC?^y@fzOZe8hw`BB2Tjw{|zF*7ALX6K-jUx$zY?b2e>g)O+e2n1_}_dC8mQU zD28yBw4JJ=U34GQCvZ`V(B&oxm<}S8+g6q)a7rMc^g0BTWIVe%R^XID^dvcE!nkb? 
zW;`Puf?0)0M!VM&+kzjVerGN%L=`)iqf1~~?A>We7dd6Kz2!V{2P?jVxb%+DjI`+u z!aBOh&PkKnVUDcu7Zid|-(8iX2lu|7ci$%~#C!$QTnV;HekWk?)ecF2YdttuZAq`f zJYH>voeFkK$QIrTMvI$+TRsp5cQoX^HgMN+P!A?eYL5};G}zNn5H9Nl{UX`n|GEKlYfVK!ga&Kab8IJt^A>|^?|n>+dAMc=NV3la&8N>a>(NX z;ABcrM<{Yar%xrf6%%|KmWcCV=sZ9U1%Dg^P!N}KZ9`Znr5E&=!CePfJ2VjBLIMNTCIzw7mlfm z2G+#xdKU=sT_A?LKt&Z6zwSTjKh-;yzW8MCaA=r5zItf&$=*R}G=1@;7OsNp`X!Wo z?wDrP$)e%Bvqe|RRvy<6=}#67G)6V6$UBgKlBGSPkELmkY&y8G-#4<{m zKG6D?8{im#TM4sr#4>34_J*F)*3*;H8B+f_eX{ZGP%BK_sl0{)(K)HMAy;~?&cxtM zp|oL{@Z2IXr0+^it97Dxb61u%)XFZXrI5LhD}qKB3KR`XWEW}`tJP`0mgDkY>lCZ) zC{bVA!@2R&Y=-1sg1-?e~SkP~r=Q(?qAzgDUPKUN_>2(&|3a%P|%LXNXT#-gTfUQpi2(Vcmx zXi5~pqbnkdnC5?&_LAgBpg~gdi2GL%$piY+OhvgkwUR;$YY}&qSG@WV7xQ#x2r3CkCfre6lI)JirP*(WfM`5>o7Wq3 z1$zAv>}SJMOEhI(Vc?#CtxnDZJfeWzJ&{%gug2~wjKpagkV7i{VB;;V3}lj?Dsa5s zakAr7*O+e6$*y6~(2nC3Llq~x1|?D5qR9*p1c@w4tr>XWQ(o{yW=WoWkUYGgxu(xJ z{=m@(qUQQBeZy6K^A&ybH3+7uURI6tk7X{nDPRo7TLPh0cS}xbjnT}?vsq`AQDein zW_3)HkrYa5%33f(C>fJN$wGm6#H@e@+BumsTNv?;ZiiOF&|fL2r$y&z?W$_&IUR%H zB2#^#=v}(bQLmT1S1g6hd(|SSI8Q6;wX*YiMSWh{c`GhIU#O_BL_go>QL;i-R0gLL zUNJ~eTO?O2?U;J10B_)BtQ?n06s(d>V^wGbU^T#+B&-Ek2k-}xA7wlVO!do4OH)sk zIV^{`Kcc+@3+Zum1{?z8xL^P&_CG=oAn*Ps(AWEm7n3JLa{j3nEOrPULW@a`Bd?}- zf+Y{ig2E1XF2MiRKbBIQf(oYs<^Ir=uDC-6QEiLR zzdn@>FZn@X>sXS)oR(?u$VUm78MWz7DD0es=iwqOoO*RK*OMSrp>0| z6t^u;K28#;oT|23${2)d42{j4N%TuYhM4HyT~G5m?q^fG_(vYqt_tokUbwe zLhkE0Mcao=5l4ybz70umf4O50zumB|#6L9@asVed1jyl-UR)8{<_p4eT6h{8=I~WZ?|COrkj|sBmcGLVT9EF99vCOTOO= zxj+CXJEAc0tZs?)IRX7H&FFU>Kv@fcw z99J%#P^Y7RLd9jDOHIRh<5KH2OXXSpn571u3&#`|h_Wkk2}(tDNn^BPZB*AZu56w_ z=UVmXHaS>)TBQp&cTP73vJ62@eUXO*`@9+;pD z`ur9RbzxbZp{a^_U#)=5`&nttOT_ONE&})ibDbSNzp1U$HZK=l)R-W1u}aarNOVa& zr+J>}Qjr8oF3l6+l0`BMFITjr$u3K!0AJR$>RRRq;!>9jPq@qFU|L%p%%7lNsufP> zAK?q1mth?ob#aNy)gAD-T-*=v+Jkk;zbfa7aQ5$T_Aq+l#TDxcaQ=VdJaH)8F_LO;$Q0&<3>1+$4B(ADM8*3a}Er>gR!TM8uu)^{h}lCGA{fACvx28w-{Q7 zAspf6MsPyT+C>cg0Vm{K;s?IuK`nXA%AcmGlQ!dAktisRtCAQdA11`HJpNQ|r&j=o zO&|V%k-GT|ESQ{0yK7fvEcVqF_I1ks+2LQZm2aT 
zO`lL!Bt!zEJRuUR#0h~^!we|l4UK5tnXI#$qVpRQ0>;clsmz3&w!qhvkdFRk_z;te z|0j#dwTyw*e>I<>HDAdYT7?tMq#VDVz0r?Oy;8p7Mk`%U)0^n8TA5l}_fVP0+Gg)?S&Ep8d8jB;S~4EevLD&MHcj_GL4SmD!w~$B05T8!xmmn-%S~ zvI>5|w-DY`vJuy_K>A1L5&;})xce6fOGiVo6#8lBB8pSlxG@q;BL^IIo zKwHENN@oI{B^u{5;&U6=3a)|=<^eBTw9eL3(gTUD%oIu1w*22he=!_)@^0Q zrD8q6265R|78onJj&$*6u^Wmui(3FX#9OwqtB~~^*AWuGC4L(UL*g!gcZzo*OZ<*_ zcLjWdb2qEZ40QP(3nT8XPIRa`F(knP@m?q&wlLn*V4=7N@Vyi-5`Q3$0CgWyUN%@P zJ}=%6^aBaHMEs$+59s|shfttYJOJc_wDLTW0R9lg%fyEP|1QPL#R>5dphl5`I`hSY z7A9CBJ_@i>JOrG_sN4eaFyN0T5at7UfK51uEmn?JiNuHjl+C9+Y_jv|_fuVUxjwiUYB8iv#eLLJzxzF#GoI#Io z!09dTaR!}3*+~HumV&+t$=wHR%8bfsrKm=rUx(l8BzLOHtA{eLuGH&vdAv}&ym!#; zjq-iYK-37p;Th?#bs>7g=^28PS|(1=6K`81N6Y-&NcL(=F{Ldxce5 zdjSpL;$I<+VClD$U>o_9kf(Tnl0>g-Uvwv+QTaEW-a&U%A-O^SfOIdEm9Wwt_)sNt z;!5trS`-kX%V!^${Hd;#EgnS$N|)E|9B}spaOD-+seDu!@DBEQdUr}sVnw1(X=NY4 zHY>4B=`I8Z5qw>9JIT8#*xcomW>$3wYLqCXCrOK;tO0dIWm}yAu)W9A=MDq|JQNk$ z6BeWgg3f^+r_|#aayx>~tzNfdz}fGXCO~K>dC|}Z23d{&#okH&X)IBpF(qb^1P zmUcSa9;d_U_ef`;?5@d6=?WI6u-sTnY#B?*oD36r)|ke1Ysl@!Iiw{+2vx}P(Iz@# zs5h$Zci!fZ+%BKg6OdlSMyS~eX&AXYtXfHqr7{Ex1WEv3HJdo1fffL}m}#bto%`b>Y*<`mVi7n(5i3V-ZVZT0yX&vw|OZgne!i=^WC^gsCKyQyik(;2o4bk350OCIq6oUXK(AIwao?sNN?< zsJ5Ul==8#g7Yw8TU04X4dG;4SSPZ>5EOaj=cfub*mM-L7EUvDYz!=QCnj=b`e%G4F z+(NP>x7QHK&KtY_v8DT~!@|PxwE)P`+5vdMq$jbLfuo+t1tTKy0*rv?ZKY92SZ%X-&V;+K+7;TBant#NWmL)g zTK(`H=d{+*o#(a1wB(XX|4{yZ?>S@sJF5H<#ivu5j3Mi>x^PP26vL`>A{o}b%`qOZ z%K=;_Crb@YCgwF$T2m?iS_#|4mGY#&z*SGr0oW{wUC-0Gi|=(j;=gK}N64f0VmL)l z+E20b^r+$|2yoCO)Ci{xJ(9nZERgO-Y&ZGwoPttZGdQu}wp?)5I{kiJE7)XI9`J+9 zl3oG4L@5bGmY;8-D+IL#@bF2@pfp03<)_Q1pd>-rTVFU^I1k)?$;!WNXL?E_sf4*!4;bzX(gZexMj^eGg?;Qz;{brH*ol+$>#X>M7z^W8+}Mz?;0jbuS^#KNFS(sP4%$Yq z$Lp5<4+=j)862Zs%q6XES2H!Kw>CZ=LD-Vil4rR1Jfr1bHN}@ne^F6lC|)dDwm4zP zKd})O01O4dsNUsu1_I!i9bV^7w=iqY@O8{^;8f%y0Q&5pRhRh}bxU}`}NOoi;@ z+Z1GJNf$X%Qm}-r!O4N2U*0J%ila=Qou1SvPk$npJlkzdHvDz+?K+6~tqPF`hryEr zPBYJg%P=&T%N3o9b_4mw5>t97D=?zGil41ZSucpDDr8k%rBqbnUvFtgjX=9m3~?UZg^WC`4uE(7|+;rlkmUpmkOJjc;{o7V>1|L91fD 
zCCLuO+4?FTm~1ko0e`OC!j|b06O03>1P2Qfpk{KvF#MmNj!s!}rS!n2gFdBpsbH=R zRXp_B=+}48uAH>uE+w!`Xk(`yW_>11pe8W2a5~N|C?TJ*Gd3oN!G%T6-a8k(e(OO* zLV!FKcjyoTWF+nG&Prl51fb)I%V#Moy_^F}qm@sF0Z_}Dl!ag+8}0V%uCg?xLwD_> z1S^z&-Etj_%S((lL3mgXHH>ThC7RXQ`gYPNG&?^Lb9}C`g&c@eAp4hz&xx4GPYOh+L?lfj8D$r0d) zMrAJFz(7<$cj9#PdfZ+}9sUy;ylXHdL2JIjfnZb(<~iahB!ZY=L^as+fMa{WHz27{ zof^Tfv8ocK`vZMn9;I14{eGVmj5`@e(C6^`0>M5{K*|Co)H{3znLub4^S;(RQon0; zjB8`%hW9NM7c2|US{6>My- zBsJ|3?E!6!V?&uWk($N{Z;X+zV$CsLVa}N1RR&|s#28E{6V9!k*fh}^wlOFWxGZcf8(noY zA0p<=@+(uNjJ05j;b5>ca>DlNiR&j;g>!4qXDs=BhW$!R4vfpwtKxc-TCc>mu+Zdh zh14rkDP(C)d#D^%<$aBs78~=6jMVFvGruUMcy(rTw)|CHX|q!OYK;o<X~=Am z%TJbS*Hx)cE-qiE(fo353BcF%Mud5DnkzKF;v44!acY4agibA1L4{MxQlRqddK<#| zsVy1Y>&v-j8UKbdq;1a7yC3vXm=o6FR16ske!jSAB`rS46h2SslxSj3eS*J#&O z=-xEz*Oltt%vVAKZRT=RTeY%QC~ruuYE5BIr*N$%{k|HSUTSNV?DPVPS8=V2m8Yxa ztxG14)jq>!VvtE=w)iEH7EcHS8%b}S2>HfKbyv7%J(i{bfKb+mxCy{Dmqpp`fdJfX zIRd^x$>nyy-JH|w>x*C7NH{lJutp>MVIB_K56S{kwU0;$YA!_o9xk zxI-Hy!%bE&;;E*eu(o-ipoG7wK{`AbRmK|`T5Fdt1esBCWbJh2Y$F(&OAJ>2ik6O& zUtI@T!Q=KesnUs zB$ZQea-^bWN_f;cx6@&89=V8LgS^kF*%cdh(~u#Tm0b!KR{$KAQJPCQ`JeB5MST z^pJAi>VVEC*4cGX`P8I;%NDM1IqLa3>lTrzTX1Nt&OdT+`N#G2vHQsOo?Lc>?CDv| zHcX!A*(>Xv86>G5mBLBHL{%`X)Ynpn4;PxBGgjN{0mnNYNG}E8#7VO|0~&J6NIGfRX(knJ>*Qd+Q9ipFqq>bZtY6Uf{DvDUB&T;+N$-7y>=3#CzUx9WW0+X5 zPOmqrf!kdv7{D8`__X8IV3gl@eJj1Xj%u2F+yg-m1UyoFl$En*k5mI4hzg(q?u2QV z0#Suu8iYG+x__fW{JPkEn+GnH17ySf0!(W6{dTzvD(;)S^Zv(JQhC@Zz_X8W11riz zxu_s-AI>u2E%rXh{6)6RQ0l}XE5ck+T_AEIt zW;fv)m-@k83eG~b6?m1rHdd&JrY?-6n#n)L9))>0@I)r}sB-eTCw{>5OnVjaAIr|k z!@$3L5!9!@PDM3n!Jl6ZndfIuSx^|e_CD|2Y)nf*JUQ%c1ky|kuWsmH*loGS?p0_v*|0lBE+l)#XLPS76L(YEkW1EmtK| zH_Rz-!sJcgx;vRgb;RrBs^SGD*2$sqG*B2XUqFB>yZ)rUV&uqNk^P>%_k=sbIFP>IjPz+kF*cT$(B7;RBZpA&cs(q{=R2n zqG|#C#&a-%IQFih^wGT;^m~yMadsCbakB03phkW^lEvABKwC_{0ef(Me2)cO+uZ$5 zdxM=gc38-Y8A#uX-T}fp>$0T_>nraiUuz9YAw((q%X5WrsU~ypl+beNjiC+ z;<4UPQV`##_zoUV0r|krKvV|DPf)@eQ+h=k6~w_?;Y90C+{e>&c>=k3*D8GgAgcDl zfdwwjyOUnmOW#2b1~fD%qmfip8@Scuhbwk`Fr(R+L?alCFeC$lJ_P$2vg`ZlAw2J{ 
z@~_=UqC1z+SyDfc@VQ8KAnOCHcZrKe`K z&D^OJ3lOJ@Y5~4(o6}~Hzri-zfOylsQrT7@KV4)6;@4(vn_m8F`JSnI+O|dNGnFdDYmFN;d{pId;9NQ!CH#;H9>oSFc#nri^NN1o zH{BJm8Lll21btEkERrF;AFg72UiV;s1(dl;xVE+u?k~^-rVUO`^rY~)3l>O8eN^di z^!RdJ4u`ZI*W|OnjNio+A^8N70a7xtqM-okSCPh8L@GfN3_{Rug|or#t=(l7B=%j|Y6+pf3vaHb1_pwO9>Pr>nL% zjh1J(FAX0tmXqiv_N?1e?-;tRe| zN-f##%I1j7HdZ=*YdEhCs$01*lYyG;F`ml-g3pX;CU^g6gtNo* z^YjpFW=)?|a%}1+8jjT?NS)T;GsH*RNY#%Qlz-C3E@zD&t>)&iwvX1ay)0b*h&&?` VhKJ7cnZK2(cCYym$Sj)L{y*6)UL61c diff --git a/ingest_eia_energy_layers.py b/ingest_eia_energy_layers.py index c82a6f0..0ff5b62 100644 --- a/ingest_eia_energy_layers.py +++ b/ingest_eia_energy_layers.py @@ -23,6 +23,7 @@ import json import os import re import sys +import time from dataclasses import dataclass from pathlib import Path from typing import List, Optional, Dict, Any @@ -30,6 +31,7 @@ from typing import List, Optional, Dict, Any import psycopg2 import requests from psycopg2 import sql +from psycopg2.extras import execute_values DB_NAME = "data_centers" @@ -51,39 +53,28 @@ SAFE_RE = re.compile(r"[^a-z0-9_]+") # EIA dataset categories mapped to infrastructure types EIA_DATASETS = { - "electric": { - "category": "electric_grid", - "endpoints": [ - "electricity/electric-power-operational-data", - "electricity/rto/region-data", - ], - }, "power": { "category": "power_plants", "endpoints": [ + # Plant-level generation assets with coordinates: what + where. "electricity/operating-generator-capacity", + # Per-plant monthly net + gross generation (Form EIA-923): how much. "electricity/facility-fuel", ], }, - "gas": { - "category": "gas_infrastructure", - "endpoints": [ - "natural-gas/move/ist", - "natural-gas/stor/sum", - "petroleum/stoc", - ], - }, } -# EIA endpoints that support lat/lon data fields for plant-level geocoding -EIA_GEOCODABLE_ENDPOINTS = { +# Extra data fields (the EIA `data[N]=` query params) each endpoint needs. 
+# operating-generator-capacity returns only id columns by default; latitude/longitude +# must be requested explicitly. facility-fuel returns only id columns; generation +# values must be requested explicitly. +EIA_DATASET_DATA_FIELDS = { "electricity/operating-generator-capacity": ["latitude", "longitude"], - "electricity/facility-fuel": ["latitude", "longitude"], + "electricity/facility-fuel": ["generation", "gross-generation"], } # Endpoints that do not reliably support retry with ad-hoc data[] field requests. EIA_NO_RETRY_EXTRA_FIELDS = { - "electricity/facility-fuel", } # US state abbreviation to FIPS code mapping for state-level GEOID linking @@ -127,92 +118,207 @@ def standardize_table_name(dataset_id: str) -> str: return f"{base[:46]}_{digest}" -def query_eia_api(endpoint: str, params: Optional[Dict[str, Any]] = None, extra_data_fields: Optional[List[str]] = None) -> Optional[Dict]: - """Query EIA API endpoint.""" - # EIA API uses /data/ suffix for data queries +class EIAClientError(Exception): + """Non-retryable EIA API error (e.g. 400 for unsupported fields).""" + + +def iter_months(start: str, end: str): + """Yield 'YYYY-MM' strings from start to end inclusive.""" + sy, sm = (int(x) for x in start.split("-")) + ey, em = (int(x) for x in end.split("-")) + y, m = sy, sm + while (y, m) <= (ey, em): + yield f"{y:04d}-{m:02d}" + m += 1 + if m > 12: + m = 1 + y += 1 + + +def discover_period_range(endpoint: str) -> tuple: + """Return (earliest, latest) 'YYYY-MM' period strings for an endpoint. + + Forces frequency=monthly so endpoints that also publish annual/quarterly + series (e.g. facility-fuel) don't return non-monthly period formats that + break iter_months. Routes through query_eia_api for retry/backoff coverage. 
+ """ + def _one(direction: str) -> str: + data = query_eia_api( + endpoint, + params={ + "length": 1, + "sort[0][column]": "period", + "sort[0][direction]": direction, + }, + query_params={"frequency": "monthly"}, + ) + rows = (data or {}).get("response", {}).get("data", []) + if not rows: + raise RuntimeError(f"no rows returned discovering period range for {endpoint}") + return rows[0]["period"] + + return _one("asc"), _one("desc") + + +def query_eia_api( + endpoint: str, + params: Optional[Dict[str, Any]] = None, + extra_data_fields: Optional[List[str]] = None, + query_params: Optional[Dict[str, Any]] = None, +) -> Optional[Dict]: + """Query EIA API endpoint with retry/backoff on transient errors. + + Returns parsed JSON on success. Raises EIAClientError on 4xx (caller + decides whether to retry without extra fields). Raises requests.RequestException + after exhausting retries on transient errors. + """ if not endpoint.endswith("/data"): endpoint = f"{endpoint}/data" - + url = f"{EIA_API_BASE}/{endpoint}/" - req_params = {"api_key": EIA_API_KEY, "length": 5000} + req_params: Dict[str, Any] = {"api_key": EIA_API_KEY, "length": 5000} + if query_params: + req_params.update(query_params) if params: req_params.update(params) - - # Add extra data fields (e.g., latitude, longitude) using EIA's array syntax + if extra_data_fields: for i, field in enumerate(extra_data_fields): req_params[f"data[{i}]"] = field - - try: - resp = requests.get(url, params=req_params, timeout=(10, 20)) - resp.raise_for_status() - return resp.json() - except requests.RequestException as e: - print(f" api error on {endpoint}: {e}") - return None + + max_attempts = 10 + base_backoff = 5.0 + max_backoff = 120.0 + last_exc: Optional[Exception] = None + for attempt in range(1, max_attempts + 1): + try: + resp = requests.get(url, params=req_params, timeout=(10, 120)) + if 400 <= resp.status_code < 500 and resp.status_code != 429: + raise EIAClientError(f"HTTP {resp.status_code} on {endpoint}: 
{resp.text[:200]}") + resp.raise_for_status() + return resp.json() + except EIAClientError: + raise + except (requests.Timeout, requests.ConnectionError, requests.HTTPError, ValueError) as e: + last_exc = e + if attempt == max_attempts: + break + sleep_s = min(base_backoff * (2 ** (attempt - 1)), max_backoff) + print(f" api error on {endpoint} (attempt {attempt}/{max_attempts}): {e}; retrying in {sleep_s:.0f}s") + time.sleep(sleep_s) + raise last_exc # type: ignore[misc] -def fetch_eia_records( +def fetch_eia_pages( endpoint: str, max_records: int = 0, extra_data_fields: Optional[List[str]] = None, -) -> Optional[List[Dict[str, Any]]]: - """Fetch EIA records with pagination; retry without extra fields on unsupported endpoints.""" + query_params: Optional[Dict[str, Any]] = None, +) -> Any: + """Yield paged EIA records; retry without extra fields on unsupported endpoints.""" page_size = 5000 offset = 0 - records: List[Dict[str, Any]] = [] + yielded = 0 used_extra_fields = extra_data_fields previous_first_row: Optional[str] = None while True: params = {"offset": offset, "length": page_size} - data = query_eia_api(endpoint, params=params, extra_data_fields=used_extra_fields) - - # Some endpoints return 400 when requesting unsupported fields (e.g. lat/lon). 
- if ( - data is None - and used_extra_fields - and endpoint not in EIA_NO_RETRY_EXTRA_FIELDS - ): - print(f" retrying {endpoint} without extra data fields") - used_extra_fields = None - data = query_eia_api(endpoint, params=params, extra_data_fields=None) + try: + data = query_eia_api( + endpoint, + params=params, + extra_data_fields=used_extra_fields, + query_params=query_params, + ) + except EIAClientError as e: + if used_extra_fields and endpoint not in EIA_NO_RETRY_EXTRA_FIELDS: + print(f" retrying {endpoint} without extra data fields ({e})") + used_extra_fields = None + data = query_eia_api( + endpoint, + params=params, + extra_data_fields=None, + query_params=query_params, + ) + else: + raise if not data: - return None if offset == 0 else records + return response = data.get("response", {}) page_records = response.get("data", []) if not page_records: - return records + return # Some EIA endpoints ignore offset and repeat page 1 forever. # Detect repeated first row signature and stop pagination. 
first_row_sig = json.dumps(page_records[0], sort_keys=True, default=str) if previous_first_row is not None and first_row_sig == previous_first_row: - return records + return previous_first_row = first_row_sig - records.extend(page_records) - total = response.get("total") try: total_int = int(total) if total is not None else None except (TypeError, ValueError): total_int = None - if max_records > 0 and len(records) >= max_records: - return records[:max_records] + if max_records > 0: + remaining = max_records - yielded + if remaining <= 0: + return + if len(page_records) > remaining: + page_records = page_records[:remaining] - if total_int is not None and len(records) >= total_int: - return records + yield page_records, used_extra_fields + yielded += len(page_records) + + if max_records > 0 and yielded >= max_records: + return + + if total_int is not None and yielded >= total_int: + return if len(page_records) < page_size: - return records + return offset += len(page_records) +def fetch_eia_pages_by_month( + endpoint: str, + earliest: str, + latest: str, + max_records: int = 0, + extra_data_fields: Optional[List[str]] = None, +) -> Any: + """Yield pages across months, querying one month at a time. + + EIA's bulk endpoints serve large offsets slowly and return frequent 503s + under sustained load. Filtering by &frequency=monthly&start=X&end=X keeps + each query small (~17k–28k rows per month for operating-generator-capacity) + and dramatically reduces failure rate and wall time. 
+ """ + yielded = 0 + for month in iter_months(earliest, latest): + if max_records > 0 and yielded >= max_records: + return + remaining = max_records - yielded if max_records > 0 else 0 + month_params = {"frequency": "monthly", "start": month, "end": month} + for page_records, used_extra_fields in fetch_eia_pages( + endpoint, + max_records=remaining, + extra_data_fields=extra_data_fields, + query_params=month_params, + ): + yield page_records, used_extra_fields, month + yielded += len(page_records) + if max_records > 0 and yielded >= max_records: + return + + def get_eia_datasets(category: str = "all") -> List[EIADataset]: """Discover EIA datasets by category.""" datasets = [] @@ -256,79 +362,91 @@ def import_layer_to_postgis(dataset: EIADataset, table_name: str, max_records: i """Import EIA dataset to PostGIS table.""" conn = connect_db() try: - # Check if this endpoint supports lat/lon geocoding - extra_fields = EIA_GEOCODABLE_ENDPOINTS.get(dataset.api_endpoint) + extra_fields = EIA_DATASET_DATA_FIELDS.get(dataset.api_endpoint) - # Query EIA API for data (with pagination), requesting lat/lon when supported. 
- records = fetch_eia_records( + earliest, latest = discover_period_range(dataset.api_endpoint) + print(f" period range: {earliest} -> {latest}") + + count = 0 + geo_count = 0 + initialized = False + current_month: Optional[str] = None + + for page_records, used_extra_fields, month in fetch_eia_pages_by_month( dataset.api_endpoint, + earliest=earliest, + latest=latest, max_records=max_records, extra_data_fields=extra_fields, - ) - if not records: - print(f" no data returned") + ): + if month != current_month: + if current_month is not None: + print(f" progress: {count} rows ingested through {current_month}") + current_month = month + if not initialized: + with conn: + with conn.cursor() as cur: + cur.execute( + f""" + CREATE TABLE IF NOT EXISTS public.{table_name} ( + gid SERIAL PRIMARY KEY, + geom GEOMETRY(GEOMETRY, 4326), + properties JSONB + ) + """ + ) + cur.execute(f"TRUNCATE TABLE public.{table_name}") + initialized = True + + geom_rows = [] + prop_rows = [] + for record in page_records: + props_json = json.dumps(record) + lat = record.get("latitude") or record.get("lat") + lon = record.get("longitude") or record.get("lon") + try: + lat = float(lat) if lat is not None else None + lon = float(lon) if lon is not None else None + except (TypeError, ValueError): + lat = lon = None + + if lat is not None and lon is not None and -90 <= lat <= 90 and -180 <= lon <= 180: + geom_rows.append((lon, lat, props_json)) + geo_count += 1 + else: + prop_rows.append((props_json,)) + count += 1 + + with conn: + with conn.cursor() as cur: + if geom_rows: + execute_values( + cur, + f"INSERT INTO public.{table_name} (geom, properties) VALUES %s", + geom_rows, + template="(ST_SetSRID(ST_MakePoint(%s, %s), 4326), %s)", + page_size=1000, + ) + if prop_rows: + execute_values( + cur, + f"INSERT INTO public.{table_name} (properties) VALUES %s", + prop_rows, + template="(%s)", + page_size=1000, + ) + + # Track if API ended up running without extra fields after retry. 
+ if used_extra_fields is None: + extra_fields = None + + if not initialized: + print(" no data returned") return False - # Create target table only once data is confirmed. - with conn: - with conn.cursor() as cur: - cur.execute( - f""" - CREATE TABLE IF NOT EXISTS public.{table_name} ( - gid SERIAL PRIMARY KEY, - geom GEOMETRY(GEOMETRY, 4326), - properties JSONB - ) - """ - ) - # Truncate to avoid duplicates on re-runs - cur.execute(f"TRUNCATE TABLE public.{table_name}") - - # Insert records into PostGIS using psycopg2. - with conn: - with conn.cursor() as cur: - count = 0 - geo_count = 0 - for record in records: - try: - props_json = json.dumps(record) - # Try to extract lat/lon for geometry - lat = record.get("latitude") or record.get("lat") - lon = record.get("longitude") or record.get("lon") - try: - lat = float(lat) if lat is not None else None - lon = float(lon) if lon is not None else None - except (TypeError, ValueError): - lat = lon = None - - if lat is not None and lon is not None and -90 <= lat <= 90 and -180 <= lon <= 180: - cur.execute( - f""" - INSERT INTO public.{table_name} (geom, properties) - VALUES (ST_SetSRID(ST_MakePoint(%s, %s), 4326), %s) - """, - (lon, lat, props_json), - ) - geo_count += 1 - else: - cur.execute( - f""" - INSERT INTO public.{table_name} (properties) - VALUES (%s) - """, - (props_json,), - ) - count += 1 - except Exception as e: - print(f" row insert error: {e}") - continue - geo_msg = f", {geo_count} with geometry" if extra_fields else "" print(f" inserted {count} features into {table_name}{geo_msg}") return count > 0 - except Exception as e: - print(f" error: {e}") - return False finally: try: conn.close() @@ -652,6 +770,153 @@ def build_summary_table(conn): cur.execute(f"analyze {SUMMARY_TABLE}") +def build_flat_tables(conn): + """Create analyst-friendly flat tables from JSON properties.""" + with conn.cursor() as cur: + cur.execute( + """ + select table_name + from information_schema.tables + where table_schema='public' 
+ and table_name in ( + 'energy_eia_electricity_operating_generator_capacity', + 'energy_eia_electricity_facility_fuel' + ) + """ + ) + available = {row[0] for row in cur.fetchall()} + + with conn: + with conn.cursor() as cur: + cur.execute("drop table if exists public.energy_eia_electric_power_operational_data_flat") + cur.execute("drop table if exists public.energy_eia_rto_region_data_flat") + + if "energy_eia_electricity_operating_generator_capacity" in available: + cur.execute("drop table if exists public.energy_eia_operating_generator_capacity_flat") + # EIA stored lower-48 longitudes as positive numbers for periods + # 2008-01 through 2010-11 (~600k rows). The negative sign is + # restored here for any state other than AK (Alaska legitimately + # has Aleutian plants east of the dateline with positive lons). + # geom is rebuilt from the corrected coordinates so the source + # table's pre-correction geometry is discarded. + cur.execute( + r""" + create table public.energy_eia_operating_generator_capacity_flat as + with parsed as ( + select + gid, + properties, + properties->>'stateid' as state_id_raw, + case + when (properties->>'latitude') ~ '^-?[0-9]+(\.[0-9]+)?$' + then (properties->>'latitude')::double precision + end as latitude_raw, + case + when (properties->>'longitude') ~ '^-?[0-9]+(\.[0-9]+)?$' + then (properties->>'longitude')::double precision + end as longitude_raw + from public.energy_eia_electricity_operating_generator_capacity + ), + fixed as ( + select + *, + case + when longitude_raw > 0 and state_id_raw <> 'AK' + then -longitude_raw + else longitude_raw + end as longitude_fixed + from parsed + ) + select + gid, + case + when latitude_raw is not null + and longitude_fixed is not null + and latitude_raw between -90 and 90 + and longitude_fixed between -180 and 180 + then st_setsrid(st_makepoint(longitude_fixed, latitude_raw), 4326) + end as geom, + properties->>'period' as period, + properties->>'plantid' as plant_id, + properties->>'plantName' 
as plant_name, + state_id_raw as state_id, + properties->>'stateName' as state_name, + properties->>'entityid' as entity_id, + properties->>'entityName' as entity_name, + properties->>'generatorid' as generator_id, + properties->>'status' as status, + properties->>'sector' as sector, + properties->>'sectorName' as sector_name, + properties->>'energy_source_code' as energy_source_code, + properties->>'energy-source-desc' as energy_source_desc, + properties->>'prime_mover_code' as prime_mover_code, + properties->>'balancing_authority_code' as balancing_authority_code, + properties->>'balancing-authority-name' as balancing_authority_name, + latitude_raw as latitude, + longitude_fixed as longitude, + properties as raw_properties + from fixed + """ + ) + cur.execute( + "create index energy_eia_operating_generator_capacity_flat_geom_gix " + "on public.energy_eia_operating_generator_capacity_flat using gist (geom)" + ) + cur.execute( + "create index energy_eia_operating_generator_capacity_flat_plant_id_idx " + "on public.energy_eia_operating_generator_capacity_flat (plant_id)" + ) + cur.execute( + "create index energy_eia_operating_generator_capacity_flat_state_id_idx " + "on public.energy_eia_operating_generator_capacity_flat (state_id)" + ) + cur.execute("analyze public.energy_eia_operating_generator_capacity_flat") + + if "energy_eia_electricity_facility_fuel" in available: + cur.execute("drop table if exists public.energy_eia_facility_fuel_flat") + cur.execute( + r""" + create table public.energy_eia_facility_fuel_flat as + select + gid, + properties->>'period' as period, + coalesce(properties->>'plantCode', properties->>'plantid') as plant_id, + properties->>'plantName' as plant_name, + properties->>'state' as state_id, + properties->>'stateDescription' as state_name, + properties->>'primeMover' as prime_mover_code, + properties->>'primeMoverDescription' as prime_mover_desc, + properties->>'fuel2002' as energy_source_code, + properties->>'fuel2002Description' as 
energy_source_desc, + case + when (properties->>'generation') ~ '^-?[0-9]+(\.[0-9]+)?$' + then (properties->>'generation')::double precision + else null + end as generation_mwh, + case + when (properties->>'gross-generation') ~ '^-?[0-9]+(\.[0-9]+)?$' + then (properties->>'gross-generation')::double precision + else null + end as gross_generation_mwh, + properties as raw_properties + from public.energy_eia_electricity_facility_fuel + """ + ) + cur.execute( + "create index energy_eia_facility_fuel_flat_plant_id_idx " + "on public.energy_eia_facility_fuel_flat (plant_id)" + ) + cur.execute( + "create index energy_eia_facility_fuel_flat_period_idx " + "on public.energy_eia_facility_fuel_flat (period)" + ) + cur.execute( + "create index energy_eia_facility_fuel_flat_state_id_idx " + "on public.energy_eia_facility_fuel_flat (state_id)" + ) + cur.execute("analyze public.energy_eia_facility_fuel_flat") + + def prune_stale_layer_versions(conn) -> int: """Drop superseded EIA layer tables and remove stale catalog rows. 
@@ -718,6 +983,90 @@ def prune_stale_layer_versions(conn) -> int: return pruned +def prune_unselected_layers(conn, selected_table_names: List[str]) -> int: + """Drop catalog/table entries that are not in the currently selected dataset set.""" + selected = set(selected_table_names) + with conn.cursor() as cur: + cur.execute( + """ + select table_name + from public.energy_atlas_layers_catalog + where table_name like 'energy_eia_%' + """ + ) + existing = [row[0] for row in cur.fetchall()] + + to_remove = [name for name in existing if name not in selected] + removed = 0 + + with conn: + with conn.cursor() as cur: + for table_name in to_remove: + cur.execute( + """ + select exists ( + select 1 + from information_schema.tables + where table_schema='public' and table_name=%s + ) + """, + (table_name,), + ) + table_exists = cur.fetchone()[0] + + if table_exists: + cur.execute( + sql.SQL("drop table if exists public.{} cascade").format( + sql.Identifier(table_name) + ) + ) + print(f"pruned unselected table public.{table_name}") + + cur.execute( + "delete from public.energy_atlas_layers_catalog where table_name = %s", + (table_name,), + ) + removed += 1 + + return removed + + +FINAL_FLAT_TABLES = ( + "energy_eia_operating_generator_capacity_flat", + "energy_eia_facility_fuel_flat", +) + + +def keep_only_target_flat_table(conn) -> int: + """Drop all energy_eia_* tables except the final flat tables.""" + with conn.cursor() as cur: + cur.execute( + """ + select table_name + from information_schema.tables + where table_schema='public' + and table_name like 'energy_eia_%%' + and table_name <> ALL(%s) + """, + (list(FINAL_FLAT_TABLES),), + ) + to_drop = [row[0] for row in cur.fetchall()] + + dropped = 0 + with conn: + with conn.cursor() as cur: + for table_name in to_drop: + cur.execute( + sql.SQL("drop table if exists public.{} cascade").format( + sql.Identifier(table_name) + ) + ) + print(f"dropped non-target table public.{table_name}") + dropped += 1 + + return dropped + + def 
parse_args(): """Parse command-line arguments.""" parser = argparse.ArgumentParser( @@ -727,8 +1076,8 @@ def parse_args(): ) parser.add_argument( "--category", - choices=["electric", "power", "gas", "all"], - default="all", + choices=["power", "all"], + default="power", help="Infrastructure category to ingest.", ) parser.add_argument( @@ -737,6 +1086,16 @@ def parse_args(): default=0, help="Cap on API records to process per dataset (0=all).", ) + parser.add_argument( + "--endpoint", + action="append", + default=None, + help=( + "Limit ingest to specific EIA endpoint(s). " + "Repeatable. Substring match against api_endpoint. " + "Other datasets are skipped (not re-ingested, not pruned)." + ), + ) parser.add_argument( "--skip-ingest", action="store_true", @@ -775,6 +1134,8 @@ def main(): # Build ingest list with table names datasets_to_ingest = [] for dataset in datasets: + if args.endpoint and not any(filt in dataset.api_endpoint for filt in args.endpoint): + continue table_name = standardize_table_name(dataset.dataset_id) datasets_to_ingest.append((dataset, table_name, dataset.category)) @@ -793,69 +1154,49 @@ def main(): if not args.skip_ingest: for dataset, table_name, category in datasets_to_ingest: - try: - print(f"importing {dataset.name} -> public.{table_name} [{category}]") - success = import_layer_to_postgis(dataset, table_name, max_records=args.max_records) - if success: - upsert_layer_catalog(conn, table_name, dataset, category) - add_geom_index_and_analyze(conn, table_name) - except Exception as e: - print(f" warning: import failed ({type(e).__name__}); skipping") - continue + print(f"importing {dataset.name} -> public.{table_name} [{category}]") + success = import_layer_to_postgis(dataset, table_name, max_records=args.max_records) + if success: + upsert_layer_catalog(conn, table_name, dataset, category) + add_geom_index_and_analyze(conn, table_name) - if not args.keep_stale_tables: - pruned = prune_stale_layer_versions(conn) - if pruned > 0: - 
print(f"pruned stale layer versions: {pruned}") - - # Rebuild GEOID links from catalog. - with conn.cursor() as cur: - cur.execute( - """ - with ranked as ( - select - c.table_name, - c.category, - row_number() over ( - partition by coalesce( - nullif(regexp_replace(c.source_url, '/data/?$', ''), ''), - nullif(c.source_item_id, ''), - c.table_name - ) - order by c.imported_at desc, c.table_name desc - ) as rn - from public.energy_atlas_layers_catalog c - join information_schema.tables t - on t.table_schema = 'public' - and t.table_name = c.table_name - ) - select table_name, category - from ranked - where rn = 1 - order by table_name - """ - ) - catalog_rows = cur.fetchall() + # Pruning compares against the *full* selected set; skip when --endpoint + # is narrowing the run, otherwise we'd drop catalog entries for endpoints + # we deliberately chose not to touch. + if not args.endpoint: + selected_table_names = [table_name for _, table_name, _ in datasets_to_ingest] + removed = prune_unselected_layers(conn, selected_table_names) + if removed > 0: + print(f"pruned unselected layers: {removed}") - reset_link_tables(conn) - for table_name, category in catalog_rows: - print(f"linking public.{table_name} -> GEOID ({category})") - link_one_table(conn, table_name, category) + if not args.keep_stale_tables: + pruned = prune_stale_layer_versions(conn) + if pruned > 0: + print(f"pruned stale layer versions: {pruned}") - build_summary_table(conn) + build_flat_tables(conn) + dropped_non_target = keep_only_target_flat_table(conn) + if dropped_non_target > 0: + print(f"dropped non-target energy tables: {dropped_non_target}") with conn.cursor() as cur: cur.execute("select count(*) from public.energy_atlas_layers_catalog") catalog_count = cur.fetchone()[0] - cur.execute(f"select count(*) from {LINK_TABLE}") - link_count = cur.fetchone()[0] - cur.execute(f"select count(*) from {SUMMARY_TABLE}") - summary_count = cur.fetchone()[0] + counts = {} + for tbl in FINAL_FLAT_TABLES: + 
cur.execute( + "select to_regclass(%s) is not null", + (f"public.{tbl}",), + ) + if cur.fetchone()[0]: + cur.execute(f"select count(*) from public.{tbl}") + counts[tbl] = cur.fetchone()[0] + else: + counts[tbl] = None - print( - f"\ndone: catalog_layers={catalog_count}, " - f"geoid_links={link_count}, geoid_summary_rows={summary_count}" - ) + print(f"\ndone: catalog_layers={catalog_count}") + for tbl, n in counts.items(): + print(f" {tbl}: {n if n is not None else 'missing'} rows") finally: conn.close() diff --git a/output/facility_fuel_pending_narrative.txt b/output/facility_fuel_pending_narrative.txt new file mode 100644 index 0000000..254942d --- /dev/null +++ b/output/facility_fuel_pending_narrative.txt @@ -0,0 +1,132 @@ +================================================================================ +EIA Facility-Fuel — Pending Dataset Narrative +Drafted 2026-05-16, prior to first successful ingest +================================================================================ + +STATUS +------ +Wired into the weekly ingest pipeline as of 2026-05-16, but not yet +populated. EIA's facility-fuel endpoint and its parent EIA-923 service +were experiencing a sustained outage at write time (network-level +connection timeouts, also visible on EIA's public dashboard). The +endpoint is queued for the next successful systemd run (Monday 03:30, +or sooner if EIA recovers). + +Target table when populated: public.energy_eia_facility_fuel_flat + +WHAT THIS DATA IS +----------------- +The "facility-fuel" endpoint +(https://api.eia.gov/v2/electricity/facility-fuel/) exposes Form EIA-923: +the monthly survey collected from electric power plants reporting their +fuel consumption and electricity output. Where operating-generator-capacity +tells us WHAT generators exist and WHERE they are, facility-fuel tells us +HOW MUCH electricity each plant actually produced each month. + +Each row represents one (plant × energy source × prime mover × month) +combination. 
A coal-gas hybrid plant with both steam turbines and +combustion turbines, for example, would have multiple rows per month — +one for each fuel/prime-mover combination it ran during that month. + +WHAT IT TELLS US (PLANNED COLUMNS) +---------------------------------- +For each plant, in each reporting month: + + period YYYY-MM reporting month + plant_id EIA plant code — joins to operating_generator_capacity_flat + plant_name Plant name (when present) + state_id Two-letter state + state_name Full state name (when present) + prime_mover_code ST=steam, CT=combustion, HY=hydro, etc. + prime_mover_desc Human-readable prime mover + energy_source_code EIA fuel code (e.g., NG=natural gas, BIT=bituminous coal) + energy_source_desc Human-readable fuel + generation_mwh NET generation in megawatt-hours (after plant use) + gross_generation_mwh GROSS generation in megawatt-hours (at the generator terminals) + raw_properties Full JSONB of the EIA response row (safety net) + +The two MWh fields are the headline numbers — actual electricity output. + +WHY BOTH TABLES MATTER +---------------------- +The capacity table answers "what generators exist and where," but a +generator that exists is not the same as a generator that produces. A +1,000 MW coal plant in standby status produces zero MWh; a 100 MW solar +farm at noon produces near its nameplate. Capacity sets the upper bound; +facility-fuel reports the realized output. + +For data-center analyses specifically, this matters because: + + - Siting decisions correlate with available local generation. The + capacity table shows nearby supply potential. The facility-fuel + table shows whether that potential is actually being realized + month-to-month (e.g., a nearby gas plant that runs only as a peaker + is a very different story from one running baseload). + + - Carbon intensity per data center can be estimated by attributing + nearby generation MWh to fuel type, weighted by distance or + balancing-authority membership.
+ + - Grid stress signals (capacity factor = generation MWh / (capacity MW × hours in period)) + flag regions where new data-center load may be unwelcome. + +JOIN PATTERN +------------ +The natural join keys are plant_id (text) and period. Typical analyst query: + + select + cap.plant_name, + cap.state_id, + cap.entity_name, + cap.latitude, + cap.longitude, + ff.period, + ff.energy_source_desc, + ff.generation_mwh, + ff.gross_generation_mwh + from public.energy_eia_facility_fuel_flat ff + join public.energy_eia_operating_generator_capacity_flat cap + on cap.plant_id = ff.plant_id + and cap.period = ff.period + where ff.period = '2026-01'; + +Note: capacity rows are per-generator; facility-fuel rows are per +plant × fuel × prime mover, so even the plant_id/period join above multiplies rows. +For most aggregate questions, aggregate one side first (e.g., sum MWh +per plant-month, or pick a representative generator per plant). + +EXPECTED SIZE +------------- +Form EIA-923 monthly publishes back to 2001-01. With ~10,000 reporting +plants and multiple fuel/prime-mover combinations per plant per month, +the table is expected in the 5–10 million row range — similar to or +somewhat larger than the capacity table. The per-month ingest strategy +(start=YYYY-MM&end=YYYY-MM, retry/backoff) is identical to the capacity +ingest and was chosen specifically because it kept that table's wall +time near two hours and recovered cleanly from EIA's transient 503s. + +UNKNOWNS AT TIME OF DRAFT +------------------------- +The flat-table SELECT was written from EIA's API documentation without +confirmation of the exact JSON key casing returned by the live endpoint +(the documentation lists facets as plantCode, fuel2002, primeMover, state +— the SELECT uses these names). If the live response differs (e.g., +primemover vs primeMover; plant_id already falls back to plantid), the affected typed columns will populate as NULL for +those rows, and the full original payload will still be available in +raw_properties for inspection.
The fix in that case is a one-line edit +to the SELECT in build_flat_tables() in ingest_eia_energy_layers.py. + +OPERATIONAL NOTES +----------------- + - Runs in the same weekly systemd job as operating-generator-capacity, + sequentially after it (Monday 03:30 via + ingest-eia-energy-layers.timer). + + - Both tables are rebuilt from scratch each run (TRUNCATE on first + page), so historical revisions EIA pushes upstream propagate + automatically. There is no incremental-load mode and none is + planned — total wall time is acceptable. + + - If EIA-923 is down at run time, the wrapper's `set -e` will mark + the systemd service as failed; the capacity ingest will still have + completed successfully because it runs first. diff --git a/output/operating_generator_capacity_sample.txt b/output/operating_generator_capacity_sample.txt new file mode 100644 index 0000000..4163e49 --- /dev/null +++ b/output/operating_generator_capacity_sample.txt @@ -0,0 +1,134 @@ +================================================================================ +EIA Operating Generator Capacity — Sample Rows + Narrative +Generated 2026-05-16 from public.energy_eia_operating_generator_capacity_flat +================================================================================ + +WHAT THIS DATA IS +----------------- +This table is a flat, queryable view of EIA's "operating-generator-capacity" +endpoint (https://api.eia.gov/v2/electricity/operating-generator-capacity/). +The underlying source is Form EIA-860, which inventories every electric +generator in the United States that is reported as operating (or recently +operating) by its owner. + +Each row represents one generator's reported status in one month. A single +power plant typically has multiple generators, so a plant like Plant Barry in +Alabama appears as several rows per month — one for each generator unit +(generator_id 1, 2, 3, ...). 
The same generator reappears every month it +remains in the inventory, so the table is a time series of (plant × generator +× month) records. + +WHAT IT TELLS US +---------------- +For each generator, in each reporting month: + - Where it is (state, balancing authority, exact latitude/longitude) + - Who owns or operates it (entity_id, entity_name) + - What fuel/energy source it uses (energy_source_code + descriptive name) + - How it generates electricity (prime_mover_code, e.g. ST=steam turbine, + HY=hydro, IC=internal combustion, WT=wind turbine) + - Its current operating status (status code, see below) + - What sector it serves (utility, IPP, industrial, commercial, etc.) + +What it does NOT tell us is how much electricity the generator actually +produces in that month — that data comes from a separate EIA endpoint +("facility-fuel", Form EIA-923), captured in a sibling table. + +STATUS CODES IN THIS TABLE +-------------------------- + OP Operating 4,229,083 rows + SB Standby / backup 339,057 rows + OS Out of service, not expected back next calendar year 99,816 rows + OA Out of service, expected back next calendar year 28,769 rows + +SUMMARY STATISTICS +------------------ + Total rows: 4,696,725 + Distinct generators (by plant_id × generator_id): ~75k + Distinct plants (plant_id): 15,791 + Distinct states/territories: 51 + Distinct months covered: 218 + Period range: 2008-01 → 2026-02 + Rows with lat/lon geometry: 4,685,500 (99.76%) + Distinct fuel codes: 38 + +TOP 10 FUELS BY ROW COUNT +------------------------- + Natural Gas 1,301,782 + Water (hydro) 908,741 + Distillate Fuel Oil* 767,207 + Solar 624,113 + Landfill Gas 317,709 + Wind 245,214 + Bituminous Coal 108,352 + Subbituminous Coal 75,587 + Electricity used for energy storage 43,833 + Geothermal 41,066 + + * EIA stores this as "Disillate Fuel Oil" (sic). The misspelling is in + EIA's source data, not introduced by ingest. Preserved verbatim.
+ +FIRST 5 ROWS (earliest period, ordered by plant_id) +--------------------------------------------------- + period | plant_id | plant_name | state | entity_name | gen_id | status | fuel | pm | latitude | longitude +---------+----------+--------------+-------+------------------+--------+--------+------------------+----+-----------+----------- + 2008-01 | 2 | Bankhead Dam | AL | Alabama Power Co | 1 | OP | Water | HY | 33.218889 | -87.579722 + 2008-01 | 3 | Barry | AL | Alabama Power Co | 1 | OP | Bituminous Coal | ST | 31.004167 | -88.013889 + 2008-01 | 3 | Barry | AL | Alabama Power Co | 2 | OP | Bituminous Coal | ST | 31.004167 | -88.013889 + 2008-01 | 3 | Barry | AL | Alabama Power Co | 3 | OP | Bituminous Coal | ST | 31.004167 | -88.013889 + 2008-01 | 3 | Barry | AL | Alabama Power Co | 4 | OP | Bituminous Coal | ST | 31.004167 | -88.013889 + +(Both plants are in Alabama; Bankhead Dam is a hydro facility on the Black +Warrior River, Plant Barry is a coal-fired steam plant near Mobile. Both +were operating in January 2008.) 
+ +LAST 5 ROWS (latest period, ordered by plant_id) +------------------------------------------------ + period | plant_id | plant_name | state | entity_name | gen_id | status | fuel | pm | latitude | longitude +---------+----------+------------+-------+----------------------------+--------+--------+---------------------+----+-----------+------------- + 2026-02 | 1 | Sand Point | AK | Sand Point Generating, LLC | 1 | SB | Disillate Fuel Oil | IC | 55.339722 | -160.497222 + 2026-02 | 1 | Sand Point | AK | Sand Point Generating, LLC | 2 | OP | Disillate Fuel Oil | IC | 55.339722 | -160.497222 + 2026-02 | 1 | Sand Point | AK | Sand Point Generating, LLC | 3 | OP | Disillate Fuel Oil | IC | 55.339722 | -160.497222 + 2026-02 | 1 | Sand Point | AK | Sand Point Generating, LLC | 5.1 | OP | Disillate Fuel Oil | IC | 55.339722 | -160.497222 + 2026-02 | 1 | Sand Point | AK | Sand Point Generating, LLC | WT1 | OS | Wind | WT | 55.339722 | -160.497222 + +(Sand Point is a small remote-Alaska community station with five generators: +four diesel internal-combustion units and one wind turbine. The wind turbine +is currently out of service.) + +KNOWN DATA-QUALITY QUIRKS IN EIA'S SOURCE DATA +---------------------------------------------- + - Historical longitude sign bug (FIXED at ingest time, 2026-05-16). + For reporting periods 2008-01 through 2010-11, EIA stored lower-48 + longitudes as positive numbers (Bankhead Dam was +87.579722 instead + of -87.579722). EIA cleaned this up in their own data starting + 2010-12, but the historical periods still had the bug. The flat + table's build step now applies: + + CASE WHEN longitude > 0 AND state_id <> 'AK' + THEN -longitude ELSE longitude END + + and rebuilds geom from the corrected coordinates. Alaska is + excluded because some Aleutian plants (~11k bug-era rows) are + legitimately west of the 180° meridian, where longitudes are positive. + Affected non-AK rows fixed: 403,558.
After the fix, every plant + in the table is at a geographically plausible US location. + + - Fuel description "Disillate Fuel Oil" (missing 't', should be + "Distillate") — EIA's spelling, preserved as-is in energy_source_desc. + +REFRESH CADENCE +--------------- +A systemd user timer rebuilds this table every Monday at 03:30 local time +via ~/.local/bin/ingest-eia-energy-layers-weekly. The ingest fetches the +full dataset per month (Jan 2008 → current) and rebuilds the flat table +from scratch each run. + +JOIN KEY FOR DOWNSTREAM ANALYSIS +-------------------------------- +plant_id (text) joins to the forthcoming energy_eia_facility_fuel_flat +table (Form EIA-923), which provides monthly net + gross generation in MWh +for the same plants. Together, the two tables answer: + + - WHERE energy is generated (this table, with lat/lon) + - WHAT is generated and by whom (this table, with fuel + entity) + - HOW MUCH is generated each month (facility_fuel_flat, in MWh)